Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cea16606ba | ||
|
|
a03f430d0e | ||
|
|
f3f7c215a7 | ||
|
|
c75302bdb8 |
+2
-2
@@ -4,7 +4,7 @@ name: test
|
|||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: test
|
- name: test
|
||||||
image: golang:1.20
|
image: golang:1.17
|
||||||
environment:
|
environment:
|
||||||
VERSION: ${DRONE_TAG:-${DRONE_COMMIT}}
|
VERSION: ${DRONE_TAG:-${DRONE_COMMIT}}
|
||||||
commands:
|
commands:
|
||||||
@@ -30,7 +30,7 @@ trigger:
|
|||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: build all binaries
|
- name: build all binaries
|
||||||
image: golang:1.20
|
image: golang:1.17
|
||||||
environment:
|
environment:
|
||||||
VERSION: ${DRONE_TAG:-${DRONE_COMMIT}}
|
VERSION: ${DRONE_TAG:-${DRONE_COMMIT}}
|
||||||
commands:
|
commands:
|
||||||
|
|||||||
+2
-2
@@ -1,11 +1,11 @@
|
|||||||
ARG REPO=library
|
ARG REPO=library
|
||||||
FROM ${REPO}/alpine:3.18
|
FROM ${REPO}/alpine:3.17
|
||||||
|
|
||||||
RUN mkdir /app
|
RUN mkdir /app
|
||||||
WORKDIR /app/
|
WORKDIR /app/
|
||||||
|
|
||||||
# Add common checking tools
|
# Add common checking tools
|
||||||
RUN apk --no-cache add bash=~5 curl=~8 jq=~1 bind-tools=~9 tzdata~=2023c
|
RUN apk --no-cache add bash=~5 curl=~8 jq=~1.6 bind-tools~=9
|
||||||
|
|
||||||
# Add minitor user for running as non-root
|
# Add minitor user for running as non-root
|
||||||
RUN addgroup -S minitor && adduser -S minitor -G minitor
|
RUN addgroup -S minitor && adduser -S minitor -G minitor
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
ARG REPO=library
|
ARG REPO=library
|
||||||
FROM golang:1.20 AS builder
|
FROM golang:1.17 AS builder
|
||||||
|
|
||||||
RUN mkdir /app
|
RUN mkdir /app
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
@@ -14,7 +14,7 @@ ARG VERSION=dev
|
|||||||
ENV CGO_ENABLED=0 GOOS=linux GOARCH=${ARCH}
|
ENV CGO_ENABLED=0 GOOS=linux GOARCH=${ARCH}
|
||||||
RUN go build -ldflags "-X main.version=${VERSION}" -a -installsuffix nocgo -o minitor .
|
RUN go build -ldflags "-X main.version=${VERSION}" -a -installsuffix nocgo -o minitor .
|
||||||
|
|
||||||
FROM ${REPO}/alpine:3.18
|
FROM ${REPO}/alpine:3.17
|
||||||
RUN mkdir /app
|
RUN mkdir /app
|
||||||
WORKDIR /app/
|
WORKDIR /app/
|
||||||
|
|
||||||
@@ -22,7 +22,7 @@ WORKDIR /app/
|
|||||||
COPY --from=builder /app/minitor .
|
COPY --from=builder /app/minitor .
|
||||||
|
|
||||||
# Add common checking tools
|
# Add common checking tools
|
||||||
RUN apk --no-cache add bash=~5 curl=~8 jq=~1 bind-tools=~9 tzdata~=2023c
|
RUN apk --no-cache add bash=~5 curl=~8 jq=~1.6 bind-tools~=9
|
||||||
|
|
||||||
# Add minitor user for running as non-root
|
# Add minitor user for running as non-root
|
||||||
RUN addgroup -S minitor && adduser -S minitor -G minitor
|
RUN addgroup -S minitor && adduser -S minitor -G minitor
|
||||||
|
|||||||
@@ -46,8 +46,6 @@ docker run -v $PWD/config.yml:/app/config.yml iamthefij/minitor-go:latest
|
|||||||
|
|
||||||
Images are provided for `amd64`, `arm`, and `arm64` architectures.
|
Images are provided for `amd64`, `arm`, and `arm64` architectures.
|
||||||
|
|
||||||
Timezone configuration for the container is set by passing the `TZ` environment variable, e.g. `TZ=America/Los_Angeles`.
|
|
||||||
|
|
||||||
## Configuring
|
## Configuring
|
||||||
|
|
||||||
In this repo, you can explore the `sample-config.yml` file for an example, but the general structure is as follows. It should be noted that environment variable interpolation happens on load of the YAML file.
|
In this repo, you can explore the `sample-config.yml` file for an example, but the general structure is as follows. It should be noted that environment variable interpolation happens on load of the YAML file.
|
||||||
@@ -96,28 +94,10 @@ Also, when alerts are executed, they will be passed through Go's format function
|
|||||||
|`{{.AlertCount}}`|Number of times this monitor has alerted|
|
|`{{.AlertCount}}`|Number of times this monitor has alerted|
|
||||||
|`{{.FailureCount}}`|The total number of sequential failed checks for this monitor|
|
|`{{.FailureCount}}`|The total number of sequential failed checks for this monitor|
|
||||||
|`{{.LastCheckOutput}}`|The last returned value from the check command to either stderr or stdout|
|
|`{{.LastCheckOutput}}`|The last returned value from the check command to either stderr or stdout|
|
||||||
|`{{.LastSuccess}}`|The datetime of the last successful check as a go Time struct|
|
|`{{.LastSuccess}}`|The ISO datetime of the last successful check|
|
||||||
|`{{.MonitorName}}`|The name of the monitor that failed and triggered the alert|
|
|`{{.MonitorName}}`|The name of the monitor that failed and triggered the alert|
|
||||||
|`{{.IsUp}}`|Indicates if the monitor that is alerting is up or not. Can be used in a conditional message template|
|
|`{{.IsUp}}`|Indicates if the monitor that is alerting is up or not. Can be used in a conditional message template|
|
||||||
|
|
||||||
To provide flexible formatting, the following non-standard functions are available in templates:
|
|
||||||
|
|
||||||
|func|description|
|
|
||||||
|---|---|
|
|
||||||
|`ANSIC <Time>`|Formats provided time in ANSIC format|
|
|
||||||
|`UnixDate <Time>`|Formats provided time in UnixDate format|
|
|
||||||
|`RubyDate <Time>`|Formats provided time in RubyDate format|
|
|
||||||
|`RFC822Z <Time>`|Formats provided time in RFC822Z format|
|
|
||||||
|`RFC850 <Time>`|Formats provided time in RFC850 format|
|
|
||||||
|`RFC1123 <Time>`|Formats provided time in RFC1123 format|
|
|
||||||
|`RFC1123Z <Time>`|Formats provided time in RFC1123Z format|
|
|
||||||
|`RFC3339 <Time>`|Formats provided time in RFC3339 format|
|
|
||||||
|`RFC3339Nano <Time>`|Formats provided time in RFC3339Nano format|
|
|
||||||
|`FormatTime <Time> <string template>`|Formats provided time according to provided template|
|
|
||||||
|`InTZ <Time> <string timezone name>`|Converts provided time to parsed timezone from the provided name|
|
|
||||||
|
|
||||||
For more information, check out the [Go documentation for the time module](https://pkg.go.dev/time@go1.20.7#pkg-constants).
|
|
||||||
|
|
||||||
### Metrics
|
### Metrics
|
||||||
|
|
||||||
Minitor supports exporting metrics for [Prometheus](https://prometheus.io/). Prometheus is an open source tool for reading and querying metrics from different sources. Combined with another tool, [Grafana](https://grafana.com/), it allows building of charts and dashboards. You could also opt to just use Minitor to log check results, and instead do your alerting with Grafana.
|
Minitor supports exporting metrics for [Prometheus](https://prometheus.io/). Prometheus is an open source tool for reading and querying metrics from different sources. Combined with another tool, [Grafana](https://grafana.com/), it allows building of charts and dashboards. You could also opt to just use Minitor to log check results, and instead do your alerting with Grafana.
|
||||||
@@ -178,7 +158,7 @@ minitor-go:
|
|||||||
check_interval: 1m30s
|
check_interval: 1m30s
|
||||||
```
|
```
|
||||||
|
|
||||||
For the time being, legacy configs for the Python version of Minitor should be compatible if you apply the `-py-compat` flag when running Minitor. Eventually, this flag will go away when later breaking changes are introduced.
|
The `-py-compat` flag has been removed. Any existing Python-oriented configuration needs to be migrated to the new templates.
|
||||||
|
|
||||||
## Future
|
## Future
|
||||||
|
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"strings"
|
|
||||||
"text/template"
|
"text/template"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@@ -45,70 +44,21 @@ func (alert Alert) IsValid() bool {
|
|||||||
|
|
||||||
// BuildTemplates compiles command templates for the Alert
|
// BuildTemplates compiles command templates for the Alert
|
||||||
func (alert *Alert) BuildTemplates() error {
|
func (alert *Alert) BuildTemplates() error {
|
||||||
// TODO: Remove legacy template support later after 1.0
|
|
||||||
legacy := strings.NewReplacer(
|
|
||||||
"{alert_count}", "{{.AlertCount}}",
|
|
||||||
"{alert_message}", "{{.MonitorName}} check has failed {{.FailureCount}} times",
|
|
||||||
"{failure_count}", "{{.FailureCount}}",
|
|
||||||
"{last_output}", "{{.LastCheckOutput}}",
|
|
||||||
"{last_success}", "{{.LastSuccess}}",
|
|
||||||
"{monitor_name}", "{{.MonitorName}}",
|
|
||||||
)
|
|
||||||
|
|
||||||
slog.Debugf("Building template for alert %s", alert.Name)
|
slog.Debugf("Building template for alert %s", alert.Name)
|
||||||
|
|
||||||
// Time format func factory
|
|
||||||
tff := func(formatString string) func(time.Time) string {
|
|
||||||
return func(t time.Time) string {
|
|
||||||
return t.Format(formatString)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create some functions for formatting datetimes in popular formats
|
|
||||||
timeFormatFuncs := template.FuncMap{
|
|
||||||
"ANSIC": tff(time.ANSIC),
|
|
||||||
"UnixDate": tff(time.UnixDate),
|
|
||||||
"RubyDate": tff(time.RubyDate),
|
|
||||||
"RFC822Z": tff(time.RFC822Z),
|
|
||||||
"RFC850": tff(time.RFC850),
|
|
||||||
"RFC1123": tff(time.RFC1123),
|
|
||||||
"RFC1123Z": tff(time.RFC1123Z),
|
|
||||||
"RFC3339": tff(time.RFC3339),
|
|
||||||
"RFC3339Nano": tff(time.RFC3339Nano),
|
|
||||||
"FormatTime": func(t time.Time, timeFormat string) string {
|
|
||||||
return t.Format(timeFormat)
|
|
||||||
},
|
|
||||||
"InTZ": func(t time.Time, tzName string) (time.Time, error) {
|
|
||||||
tz, err := time.LoadLocation(tzName)
|
|
||||||
if err != nil {
|
|
||||||
return t, fmt.Errorf("failed to convert time to specified tz: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return t.In(tz), nil
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
case alert.commandTemplate == nil && alert.Command.Command != nil:
|
case alert.commandTemplate == nil && alert.Command.Command != nil:
|
||||||
alert.commandTemplate = []*template.Template{}
|
alert.commandTemplate = []*template.Template{}
|
||||||
for i, cmdPart := range alert.Command.Command {
|
for i, cmdPart := range alert.Command.Command {
|
||||||
if PyCompat {
|
|
||||||
cmdPart = legacy.Replace(cmdPart)
|
|
||||||
}
|
|
||||||
|
|
||||||
alert.commandTemplate = append(alert.commandTemplate, template.Must(
|
alert.commandTemplate = append(alert.commandTemplate, template.Must(
|
||||||
template.New(alert.Name+fmt.Sprint(i)).Funcs(timeFormatFuncs).Parse(cmdPart),
|
template.New(alert.Name+fmt.Sprint(i)).Parse(cmdPart),
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
case alert.commandShellTemplate == nil && alert.Command.ShellCommand != "":
|
case alert.commandShellTemplate == nil && alert.Command.ShellCommand != "":
|
||||||
shellCmd := alert.Command.ShellCommand
|
shellCmd := alert.Command.ShellCommand
|
||||||
|
|
||||||
if PyCompat {
|
|
||||||
shellCmd = legacy.Replace(shellCmd)
|
|
||||||
}
|
|
||||||
|
|
||||||
alert.commandShellTemplate = template.Must(
|
alert.commandShellTemplate = template.Must(
|
||||||
template.New(alert.Name).Funcs(timeFormatFuncs).Parse(shellCmd),
|
template.New(alert.Name).Parse(shellCmd),
|
||||||
)
|
)
|
||||||
default:
|
default:
|
||||||
return fmt.Errorf("No template provided for alert %s: %w", alert.Name, errNoTemplate)
|
return fmt.Errorf("No template provided for alert %s: %w", alert.Name, errNoTemplate)
|
||||||
@@ -168,7 +118,7 @@ func (alert Alert) Send(notice AlertNotice) (outputStr string, err error) {
|
|||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
err = fmt.Errorf(
|
err = fmt.Errorf(
|
||||||
"Alert %s failed to send. Returned %w: %w",
|
"Alert '%s' failed to send. Returned %v: %w",
|
||||||
alert.Name,
|
alert.Name,
|
||||||
err,
|
err,
|
||||||
ErrAlertFailed,
|
ErrAlertFailed,
|
||||||
|
|||||||
@@ -70,14 +70,6 @@ func TestAlertSend(t *testing.T) {
|
|||||||
"Command shell with bad template",
|
"Command shell with bad template",
|
||||||
false,
|
false,
|
||||||
},
|
},
|
||||||
{
|
|
||||||
Alert{Command: CommandOrShell{ShellCommand: "echo {alert_message}"}},
|
|
||||||
AlertNotice{MonitorName: "test", FailureCount: 1},
|
|
||||||
"test check has failed 1 times\n",
|
|
||||||
false,
|
|
||||||
"Command shell with legacy template",
|
|
||||||
true,
|
|
||||||
},
|
|
||||||
// Test default log alert down
|
// Test default log alert down
|
||||||
{
|
{
|
||||||
*NewLogAlert(),
|
*NewLogAlert(),
|
||||||
@@ -100,8 +92,6 @@ func TestAlertSend(t *testing.T) {
|
|||||||
|
|
||||||
for _, c := range cases {
|
for _, c := range cases {
|
||||||
log.Printf("Testing case %s", c.name)
|
log.Printf("Testing case %s", c.name)
|
||||||
// Set PyCompat to value of compat flag
|
|
||||||
PyCompat = c.pyCompat
|
|
||||||
|
|
||||||
err := c.alert.BuildTemplates()
|
err := c.alert.BuildTemplates()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -121,9 +111,6 @@ func TestAlertSend(t *testing.T) {
|
|||||||
log.Printf("Case failed: %s", c.name)
|
log.Printf("Case failed: %s", c.name)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set PyCompat back to default value
|
|
||||||
PyCompat = false
|
|
||||||
|
|
||||||
log.Println("-----")
|
log.Println("-----")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ var errInvalidConfig = errors.New("Invalid configuration")
|
|||||||
|
|
||||||
// Config type contains all provided user configuration
|
// Config type contains all provided user configuration
|
||||||
type Config struct {
|
type Config struct {
|
||||||
CheckInterval SecondsOrDuration `yaml:"check_interval"`
|
CheckInterval time.Duration `yaml:"check_interval"`
|
||||||
DefaultAlertAfter int16 `yaml:"default_alert_after"`
|
DefaultAlertAfter int16 `yaml:"default_alert_after"`
|
||||||
DefaultAlertEvery *int16 `yaml:"default_alert_every"`
|
DefaultAlertEvery *int16 `yaml:"default_alert_every"`
|
||||||
DefaultAlertDown []string `yaml:"default_alert_down"`
|
DefaultAlertDown []string `yaml:"default_alert_down"`
|
||||||
@@ -56,34 +56,6 @@ func (cos *CommandOrShell) UnmarshalYAML(unmarshal func(interface{}) error) erro
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// SecondsOrDuration wraps a duration value for parsing a duration or seconds from YAML
|
|
||||||
// NOTE: This should be removed in favor of only parsing durations once compatibility is broken
|
|
||||||
type SecondsOrDuration struct {
|
|
||||||
value time.Duration
|
|
||||||
}
|
|
||||||
|
|
||||||
// Value returns a duration value
|
|
||||||
func (sod SecondsOrDuration) Value() time.Duration {
|
|
||||||
return sod.value
|
|
||||||
}
|
|
||||||
|
|
||||||
// UnmarshalYAML allows unmarshalling a duration value or seconds if an int was provided
|
|
||||||
func (sod *SecondsOrDuration) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
|
||||||
var seconds int64
|
|
||||||
err := unmarshal(&seconds)
|
|
||||||
|
|
||||||
if err == nil {
|
|
||||||
sod.value = time.Second * time.Duration(seconds)
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Error indicates that we don't have an int
|
|
||||||
err = unmarshal(&sod.value)
|
|
||||||
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
// IsValid checks config validity and returns true if valid
|
// IsValid checks config validity and returns true if valid
|
||||||
func (config Config) IsValid() (isValid bool) {
|
func (config Config) IsValid() (isValid bool) {
|
||||||
isValid = true
|
isValid = true
|
||||||
@@ -182,18 +154,6 @@ func LoadConfig(filePath string) (config Config, err error) {
|
|||||||
|
|
||||||
slog.Debugf("Config values:\n%v\n", config)
|
slog.Debugf("Config values:\n%v\n", config)
|
||||||
|
|
||||||
// Add log alert if not present
|
|
||||||
if PyCompat {
|
|
||||||
// Initialize alerts list if not present
|
|
||||||
if config.Alerts == nil {
|
|
||||||
config.Alerts = map[string]*Alert{}
|
|
||||||
}
|
|
||||||
|
|
||||||
if _, ok := config.Alerts["log"]; !ok {
|
|
||||||
config.Alerts["log"] = NewLogAlert()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Finish initializing configuration
|
// Finish initializing configuration
|
||||||
if err = config.Init(); err != nil {
|
if err = config.Init(); err != nil {
|
||||||
return
|
return
|
||||||
|
|||||||
+3
-9
@@ -15,7 +15,6 @@ func TestLoadConfig(t *testing.T) {
|
|||||||
}{
|
}{
|
||||||
{"./test/valid-config.yml", false, "Valid config file", false},
|
{"./test/valid-config.yml", false, "Valid config file", false},
|
||||||
{"./test/valid-config-default-values.yml", false, "Valid config file with default values", false},
|
{"./test/valid-config-default-values.yml", false, "Valid config file with default values", false},
|
||||||
{"./test/valid-default-log-alert.yml", false, "Valid config file with default log alert PyCompat", true},
|
|
||||||
{"./test/valid-default-log-alert.yml", true, "Invalid config file no log alert", false},
|
{"./test/valid-default-log-alert.yml", true, "Invalid config file no log alert", false},
|
||||||
{"./test/does-not-exist", true, "Invalid config path", false},
|
{"./test/does-not-exist", true, "Invalid config path", false},
|
||||||
{"./test/invalid-config-type.yml", true, "Invalid config type for key", false},
|
{"./test/invalid-config-type.yml", true, "Invalid config type for key", false},
|
||||||
@@ -25,8 +24,6 @@ func TestLoadConfig(t *testing.T) {
|
|||||||
|
|
||||||
for _, c := range cases {
|
for _, c := range cases {
|
||||||
log.Printf("Testing case %s", c.name)
|
log.Printf("Testing case %s", c.name)
|
||||||
// Set PyCompat based on compatibility mode
|
|
||||||
PyCompat = c.pyCompat
|
|
||||||
_, err := LoadConfig(c.configPath)
|
_, err := LoadConfig(c.configPath)
|
||||||
hasErr := (err != nil)
|
hasErr := (err != nil)
|
||||||
|
|
||||||
@@ -34,9 +31,6 @@ func TestLoadConfig(t *testing.T) {
|
|||||||
t.Errorf("LoadConfig(%v), expected_error=%v actual=%v", c.name, c.expectErr, err)
|
t.Errorf("LoadConfig(%v), expected_error=%v actual=%v", c.name, c.expectErr, err)
|
||||||
log.Printf("Case failed: %s", c.name)
|
log.Printf("Case failed: %s", c.name)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set PyCompat to default value
|
|
||||||
PyCompat = false
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -53,15 +47,15 @@ func TestIntervalParsing(t *testing.T) {
|
|||||||
oneMinute := time.Minute
|
oneMinute := time.Minute
|
||||||
|
|
||||||
// validate top level interval seconds represented as an int
|
// validate top level interval seconds represented as an int
|
||||||
if config.CheckInterval.Value() != oneSecond {
|
if config.CheckInterval != oneSecond {
|
||||||
t.Errorf("Incorrectly parsed int seconds. expected=%v actual=%v", oneSecond, config.CheckInterval)
|
t.Errorf("Incorrectly parsed int seconds. expected=%v actual=%v", oneSecond, config.CheckInterval)
|
||||||
}
|
}
|
||||||
|
|
||||||
if config.Monitors[0].CheckInterval.Value() != tenSeconds {
|
if config.Monitors[0].CheckInterval != tenSeconds {
|
||||||
t.Errorf("Incorrectly parsed seconds duration. expected=%v actual=%v", oneSecond, config.CheckInterval)
|
t.Errorf("Incorrectly parsed seconds duration. expected=%v actual=%v", oneSecond, config.CheckInterval)
|
||||||
}
|
}
|
||||||
|
|
||||||
if config.Monitors[1].CheckInterval.Value() != oneMinute {
|
if config.Monitors[1].CheckInterval != oneMinute {
|
||||||
t.Errorf("Incorrectly parsed seconds duration. expected=%v actual=%v", oneSecond, config.CheckInterval)
|
t.Errorf("Incorrectly parsed seconds duration. expected=%v actual=%v", oneSecond, config.CheckInterval)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
module git.iamthefij.com/iamthefij/minitor-go
|
module git.iamthefij.com/iamthefij/minitor-go
|
||||||
|
|
||||||
go 1.20
|
go 1.17
|
||||||
|
|
||||||
require (
|
require (
|
||||||
git.iamthefij.com/iamthefij/slog v1.3.0
|
git.iamthefij.com/iamthefij/slog v1.3.0
|
||||||
|
|||||||
@@ -1,72 +0,0 @@
|
|||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"io"
|
|
||||||
"net/http"
|
|
||||||
"strings"
|
|
||||||
)
|
|
||||||
|
|
||||||
type HealthCheckHandler struct {
|
|
||||||
isMinitorHealthy bool
|
|
||||||
monitors []*Monitor
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewHealthCheckHandler(monitors []*Monitor) *HealthCheckHandler {
|
|
||||||
return &HealthCheckHandler{
|
|
||||||
false,
|
|
||||||
monitors,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (hch *HealthCheckHandler) MinitorHealthy(healthy bool) {
|
|
||||||
hch.isMinitorHealthy = healthy
|
|
||||||
}
|
|
||||||
|
|
||||||
func (hch HealthCheckHandler) MinitorHealthCheck() (bool, string) {
|
|
||||||
if hch.isMinitorHealthy {
|
|
||||||
return true, "OK"
|
|
||||||
} else {
|
|
||||||
return false, "UNHEALTHY"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (hch HealthCheckHandler) MonitorsHealthCheck() (bool, string) {
|
|
||||||
downMonitors := []string{}
|
|
||||||
|
|
||||||
for _, monitor := range hch.monitors {
|
|
||||||
if !monitor.IsUp() {
|
|
||||||
downMonitors = append(downMonitors, monitor.Name)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(downMonitors) == 0 {
|
|
||||||
return true, "OK"
|
|
||||||
} else {
|
|
||||||
return false, fmt.Sprintf("UNHEALTHY: The following monitors are unhealthy: %s", strings.Join(downMonitors, ", "))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (hch HealthCheckHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
|
||||||
var healthy bool
|
|
||||||
|
|
||||||
var body string
|
|
||||||
|
|
||||||
if monitors := r.URL.Query().Get("monitors"); monitors != "" {
|
|
||||||
healthy, body = hch.MonitorsHealthCheck()
|
|
||||||
} else {
|
|
||||||
healthy, body = hch.MinitorHealthCheck()
|
|
||||||
}
|
|
||||||
|
|
||||||
if healthy {
|
|
||||||
w.WriteHeader(http.StatusOK)
|
|
||||||
} else {
|
|
||||||
w.WriteHeader(http.StatusServiceUnavailable)
|
|
||||||
}
|
|
||||||
|
|
||||||
_, _ = io.WriteString(w, body)
|
|
||||||
}
|
|
||||||
|
|
||||||
func HandleHealthCheck() {
|
|
||||||
http.Handle("/metrics", HealthChecks)
|
|
||||||
}
|
|
||||||
@@ -1,79 +0,0 @@
|
|||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"testing"
|
|
||||||
)
|
|
||||||
|
|
||||||
func TestNewHealthCheck(t *testing.T) {
|
|
||||||
monitors := []*Monitor{
|
|
||||||
{Name: "Test Monitor"},
|
|
||||||
}
|
|
||||||
hc := NewHealthCheckHandler(monitors)
|
|
||||||
|
|
||||||
monitors[0].alertCount++
|
|
||||||
|
|
||||||
if healthy, _ := hc.MinitorHealthCheck(); healthy {
|
|
||||||
t.Errorf("Initial hc state should be unhealthy until some successful alert is sent")
|
|
||||||
}
|
|
||||||
|
|
||||||
if healthy, _ := hc.MonitorsHealthCheck(); healthy {
|
|
||||||
t.Errorf("Faking an alert on the monitor pointer should make this unhealthy")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestMinitorHealthCheck(t *testing.T) {
|
|
||||||
monitors := []*Monitor{
|
|
||||||
{Name: "Test Monitor"},
|
|
||||||
}
|
|
||||||
hc := NewHealthCheckHandler(monitors)
|
|
||||||
|
|
||||||
t.Run("MinitorHealthCheck(healthy)", func(t *testing.T) {
|
|
||||||
hc.MinitorHealthy(true)
|
|
||||||
healthy, body := hc.MinitorHealthCheck()
|
|
||||||
if !healthy {
|
|
||||||
t.Errorf("Expected healthy check")
|
|
||||||
}
|
|
||||||
if body != "OK" {
|
|
||||||
t.Errorf("Expected OK response")
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("MinitorHealthCheck(unhealthy)", func(t *testing.T) {
|
|
||||||
hc.MinitorHealthy(false)
|
|
||||||
healthy, body := hc.MinitorHealthCheck()
|
|
||||||
if healthy {
|
|
||||||
t.Errorf("Expected healthy check")
|
|
||||||
}
|
|
||||||
if body != "UNHEALTHY" {
|
|
||||||
t.Errorf("Expected UNHEALTHY response")
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestMonitorsHealthCheck(t *testing.T) {
|
|
||||||
monitors := []*Monitor{
|
|
||||||
{Name: "Test Monitor"},
|
|
||||||
}
|
|
||||||
hc := NewHealthCheckHandler(monitors)
|
|
||||||
|
|
||||||
t.Run("MonitorsHealthCheck(healthy)", func(t *testing.T) {
|
|
||||||
healthy, body := hc.MonitorsHealthCheck()
|
|
||||||
if !healthy {
|
|
||||||
t.Errorf("Expected healthy check")
|
|
||||||
}
|
|
||||||
if body != "OK" {
|
|
||||||
t.Errorf("Expected OK response")
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("MonitorsHealthCheck(unhealthy)", func(t *testing.T) {
|
|
||||||
monitors[0].alertCount++
|
|
||||||
healthy, body := hc.MonitorsHealthCheck()
|
|
||||||
if healthy {
|
|
||||||
t.Errorf("Expected healthy check")
|
|
||||||
}
|
|
||||||
if body != "UNHEALTHY: The following monitors are unhealthy: Test Monitor" {
|
|
||||||
t.Errorf("Expected UNHEALTHY response")
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
@@ -4,7 +4,6 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
"net/http"
|
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"git.iamthefij.com/iamthefij/slog"
|
"git.iamthefij.com/iamthefij/slog"
|
||||||
@@ -17,13 +16,6 @@ var (
|
|||||||
MetricsPort = 8080
|
MetricsPort = 8080
|
||||||
// Metrics contains all active metrics
|
// Metrics contains all active metrics
|
||||||
Metrics = NewMetrics()
|
Metrics = NewMetrics()
|
||||||
// Self monitor rather than panicking
|
|
||||||
SelfMonitor = false
|
|
||||||
// HealthChecks contains health check values
|
|
||||||
HealthChecks *HealthCheckHandler = nil
|
|
||||||
|
|
||||||
// PyCompat enables support for legacy Python templates
|
|
||||||
PyCompat = false
|
|
||||||
|
|
||||||
// version of minitor being run
|
// version of minitor being run
|
||||||
version = "dev"
|
version = "dev"
|
||||||
@@ -56,13 +48,7 @@ func sendAlerts(config *Config, monitor *Monitor, alertNotice *AlertNotice) erro
|
|||||||
output,
|
output,
|
||||||
)
|
)
|
||||||
|
|
||||||
if SelfMonitor {
|
|
||||||
Metrics.SetMonitorStatus(fmt.Sprintf("Alert %s", alertName), false)
|
|
||||||
}
|
|
||||||
|
|
||||||
return err
|
return err
|
||||||
} else {
|
|
||||||
Metrics.SetMonitorStatus(fmt.Sprintf("Alert %s", alertName), true)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Count alert metrics
|
// Count alert metrics
|
||||||
@@ -80,8 +66,6 @@ func sendAlerts(config *Config, monitor *Monitor, alertNotice *AlertNotice) erro
|
|||||||
|
|
||||||
func checkMonitors(config *Config) error {
|
func checkMonitors(config *Config) error {
|
||||||
// TODO: Run this in goroutines and capture exceptions
|
// TODO: Run this in goroutines and capture exceptions
|
||||||
healthy := true
|
|
||||||
|
|
||||||
for _, monitor := range config.Monitors {
|
for _, monitor := range config.Monitors {
|
||||||
if monitor.ShouldCheck() {
|
if monitor.ShouldCheck() {
|
||||||
success, alertNotice := monitor.Check()
|
success, alertNotice := monitor.Check()
|
||||||
@@ -93,42 +77,24 @@ func checkMonitors(config *Config) error {
|
|||||||
|
|
||||||
if alertNotice != nil {
|
if alertNotice != nil {
|
||||||
err := sendAlerts(config, monitor, alertNotice)
|
err := sendAlerts(config, monitor, alertNotice)
|
||||||
// If there was an error in sending an alert, mark as unhealthy or bubble up
|
// If there was an error in sending an alert, exit early and bubble it up
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if SelfMonitor {
|
|
||||||
healthy = false
|
|
||||||
} else {
|
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if HealthChecks != nil {
|
|
||||||
HealthChecks.MinitorHealthy(healthy)
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// ServeMetricsAndHealth starts the default http server
|
|
||||||
func ServeMetricsAndHealth() {
|
|
||||||
host := fmt.Sprintf(":%d", MetricsPort)
|
|
||||||
|
|
||||||
_ = http.ListenAndServe(host, nil)
|
|
||||||
}
|
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
showVersion := flag.Bool("version", false, "Display the version of minitor and exit")
|
showVersion := flag.Bool("version", false, "Display the version of minitor and exit")
|
||||||
configPath := flag.String("config", "config.yml", "Alternate configuration path (default: config.yml)")
|
configPath := flag.String("config", "config.yml", "Alternate configuration path (default: config.yml)")
|
||||||
|
|
||||||
flag.BoolVar(&slog.DebugLevel, "debug", false, "Enables debug logs (default: false)")
|
flag.BoolVar(&slog.DebugLevel, "debug", false, "Enables debug logs (default: false)")
|
||||||
flag.BoolVar(&ExportMetrics, "metrics", false, "Enables prometheus metrics exporting (default: false)")
|
flag.BoolVar(&ExportMetrics, "metrics", false, "Enables prometheus metrics exporting (default: false)")
|
||||||
flag.BoolVar(&PyCompat, "py-compat", false, "Enables support for legacy Python Minitor config. Will eventually be removed. (default: false)")
|
flag.IntVar(&MetricsPort, "metrics-port", MetricsPort, "The port that Prometheus metrics should be exported on, if enabled. (default: 8080)")
|
||||||
flag.IntVar(&MetricsPort, "metrics-port", MetricsPort, "The port that Prometheus metrics and healthchecks should be exported on, if enabled. (default: 8080)")
|
|
||||||
flag.BoolVar(&SelfMonitor, "self-monitor", false, "Enables self-monitoring. Export metrics rather than panic when alerts fail. (default: false)")
|
|
||||||
|
|
||||||
flag.Parse()
|
flag.Parse()
|
||||||
|
|
||||||
// Print version if flag is provided
|
// Print version if flag is provided
|
||||||
@@ -145,19 +111,8 @@ func main() {
|
|||||||
// Serve metrics exporter, if specified
|
// Serve metrics exporter, if specified
|
||||||
if ExportMetrics {
|
if ExportMetrics {
|
||||||
slog.Infof("Exporting metrics to Prometheus on port %d", MetricsPort)
|
slog.Infof("Exporting metrics to Prometheus on port %d", MetricsPort)
|
||||||
HandleMetrics()
|
|
||||||
}
|
|
||||||
|
|
||||||
if SelfMonitor {
|
go ServeMetrics()
|
||||||
slog.Infof("Starting healthcheck endpoint on port %d", MetricsPort)
|
|
||||||
|
|
||||||
HealthChecks = NewHealthCheckHandler(config.Monitors)
|
|
||||||
|
|
||||||
HandleHealthCheck()
|
|
||||||
}
|
|
||||||
|
|
||||||
if ExportMetrics || SelfMonitor {
|
|
||||||
go ServeMetricsAndHealth()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Start main loop
|
// Start main loop
|
||||||
@@ -165,6 +120,6 @@ func main() {
|
|||||||
err = checkMonitors(&config)
|
err = checkMonitors(&config)
|
||||||
slog.OnErrPanicf(err, "Error checking monitors")
|
slog.OnErrPanicf(err, "Error checking monitors")
|
||||||
|
|
||||||
time.Sleep(config.CheckInterval.Value())
|
time.Sleep(config.CheckInterval)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ func TestCheckMonitors(t *testing.T) {
|
|||||||
config Config
|
config Config
|
||||||
expectErr bool
|
expectErr bool
|
||||||
name string
|
name string
|
||||||
selfMonitor bool
|
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
config: Config{},
|
config: Config{},
|
||||||
@@ -25,7 +24,6 @@ func TestCheckMonitors(t *testing.T) {
|
|||||||
},
|
},
|
||||||
expectErr: false,
|
expectErr: false,
|
||||||
name: "Monitor success, no alerts",
|
name: "Monitor success, no alerts",
|
||||||
selfMonitor: false,
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
config: Config{
|
config: Config{
|
||||||
@@ -39,7 +37,6 @@ func TestCheckMonitors(t *testing.T) {
|
|||||||
},
|
},
|
||||||
expectErr: false,
|
expectErr: false,
|
||||||
name: "Monitor failure, no alerts",
|
name: "Monitor failure, no alerts",
|
||||||
selfMonitor: false,
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
config: Config{
|
config: Config{
|
||||||
@@ -53,7 +50,6 @@ func TestCheckMonitors(t *testing.T) {
|
|||||||
},
|
},
|
||||||
expectErr: false,
|
expectErr: false,
|
||||||
name: "Monitor recovery, no alerts",
|
name: "Monitor recovery, no alerts",
|
||||||
selfMonitor: false,
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
config: Config{
|
config: Config{
|
||||||
@@ -68,7 +64,6 @@ func TestCheckMonitors(t *testing.T) {
|
|||||||
},
|
},
|
||||||
expectErr: true,
|
expectErr: true,
|
||||||
name: "Monitor failure, unknown alerts",
|
name: "Monitor failure, unknown alerts",
|
||||||
selfMonitor: false,
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
config: Config{
|
config: Config{
|
||||||
@@ -83,22 +78,6 @@ func TestCheckMonitors(t *testing.T) {
|
|||||||
},
|
},
|
||||||
expectErr: true,
|
expectErr: true,
|
||||||
name: "Monitor recovery, unknown alerts",
|
name: "Monitor recovery, unknown alerts",
|
||||||
selfMonitor: false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
config: Config{
|
|
||||||
Monitors: []*Monitor{
|
|
||||||
{
|
|
||||||
Name: "Success",
|
|
||||||
Command: CommandOrShell{Command: []string{"true"}},
|
|
||||||
AlertUp: []string{"unknown"},
|
|
||||||
alertCount: 1,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
expectErr: false,
|
|
||||||
name: "Monitor recovery, unknown alerts, with Health Check",
|
|
||||||
selfMonitor: true,
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
config: Config{
|
config: Config{
|
||||||
@@ -118,7 +97,6 @@ func TestCheckMonitors(t *testing.T) {
|
|||||||
},
|
},
|
||||||
expectErr: false,
|
expectErr: false,
|
||||||
name: "Monitor failure, successful alert",
|
name: "Monitor failure, successful alert",
|
||||||
selfMonitor: false,
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
config: Config{
|
config: Config{
|
||||||
@@ -139,34 +117,10 @@ func TestCheckMonitors(t *testing.T) {
|
|||||||
},
|
},
|
||||||
expectErr: true,
|
expectErr: true,
|
||||||
name: "Monitor failure, bad alert",
|
name: "Monitor failure, bad alert",
|
||||||
selfMonitor: false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
config: Config{
|
|
||||||
Monitors: []*Monitor{
|
|
||||||
{
|
|
||||||
Name: "Failure",
|
|
||||||
Command: CommandOrShell{Command: []string{"false"}},
|
|
||||||
AlertDown: []string{"bad"},
|
|
||||||
AlertAfter: 1,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
Alerts: map[string]*Alert{
|
|
||||||
"bad": {
|
|
||||||
Name: "bad",
|
|
||||||
Command: CommandOrShell{Command: []string{"false"}},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
expectErr: false,
|
|
||||||
name: "Monitor failure, bad alert, with Health Check",
|
|
||||||
selfMonitor: true,
|
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, c := range cases {
|
for _, c := range cases {
|
||||||
SelfMonitor = c.selfMonitor
|
|
||||||
|
|
||||||
err := c.config.Init()
|
err := c.config.Init()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Errorf("checkMonitors(%s): unexpected error reading config: %v", c.name, err)
|
t.Errorf("checkMonitors(%s): unexpected error reading config: %v", c.name, err)
|
||||||
|
|||||||
+7
-2
@@ -1,6 +1,7 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
@@ -106,7 +107,11 @@ func (metrics *MinitorMetrics) CountAlert(monitor string, alert string) {
|
|||||||
).Inc()
|
).Inc()
|
||||||
}
|
}
|
||||||
|
|
||||||
// HandleMetrics add Prometheus metrics handler to default http server
|
// ServeMetrics starts an http server with a Prometheus metrics handler
|
||||||
func HandleMetrics() {
|
func ServeMetrics() {
|
||||||
http.Handle("/metrics", promhttp.Handler())
|
http.Handle("/metrics", promhttp.Handler())
|
||||||
|
|
||||||
|
host := fmt.Sprintf(":%d", MetricsPort)
|
||||||
|
|
||||||
|
_ = http.ListenAndServe(host, nil)
|
||||||
}
|
}
|
||||||
|
|||||||
+2
-2
@@ -13,7 +13,7 @@ type Monitor struct { //nolint:maligned
|
|||||||
// Config values
|
// Config values
|
||||||
AlertAfter int16 `yaml:"alert_after"`
|
AlertAfter int16 `yaml:"alert_after"`
|
||||||
AlertEvery *int16 `yaml:"alert_every"`
|
AlertEvery *int16 `yaml:"alert_every"`
|
||||||
CheckInterval SecondsOrDuration `yaml:"check_interval"`
|
CheckInterval time.Duration `yaml:"check_interval"`
|
||||||
Name string
|
Name string
|
||||||
AlertDown []string `yaml:"alert_down"`
|
AlertDown []string `yaml:"alert_down"`
|
||||||
AlertUp []string `yaml:"alert_up"`
|
AlertUp []string `yaml:"alert_up"`
|
||||||
@@ -45,7 +45,7 @@ func (monitor Monitor) ShouldCheck() bool {
|
|||||||
|
|
||||||
sinceLastCheck := time.Since(monitor.lastCheck)
|
sinceLastCheck := time.Since(monitor.lastCheck)
|
||||||
|
|
||||||
return sinceLastCheck >= monitor.CheckInterval.Value()
|
return sinceLastCheck >= monitor.CheckInterval
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check will run the command configured by the Monitor and return a status
|
// Check will run the command configured by the Monitor and return a status
|
||||||
|
|||||||
+3
-3
@@ -45,9 +45,9 @@ func TestMonitorShouldCheck(t *testing.T) {
|
|||||||
name string
|
name string
|
||||||
}{
|
}{
|
||||||
{Monitor{}, true, "Empty"},
|
{Monitor{}, true, "Empty"},
|
||||||
{Monitor{lastCheck: timeNow, CheckInterval: SecondsOrDuration{time.Second * 15}}, false, "Just checked"},
|
{Monitor{lastCheck: timeNow, CheckInterval: time.Second * 15}, false, "Just checked"},
|
||||||
{Monitor{lastCheck: timeTenSecAgo, CheckInterval: SecondsOrDuration{time.Second * 15}}, false, "-10s"},
|
{Monitor{lastCheck: timeTenSecAgo, CheckInterval: time.Second * 15}, false, "-10s"},
|
||||||
{Monitor{lastCheck: timeTwentySecAgo, CheckInterval: SecondsOrDuration{time.Second * 15}}, true, "-20s"},
|
{Monitor{lastCheck: timeTwentySecAgo, CheckInterval: time.Second * 15}, true, "-20s"},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, c := range cases {
|
for _, c := range cases {
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
---
|
---
|
||||||
check_interval: 1
|
check_interval: 1s
|
||||||
|
|
||||||
monitors:
|
monitors:
|
||||||
- name: Command
|
- name: Command
|
||||||
|
|||||||
Reference in New Issue
Block a user