Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
60cfac948b |
@@ -9,6 +9,7 @@ linters:
|
|||||||
- gomnd
|
- gomnd
|
||||||
- goprintffuncname
|
- goprintffuncname
|
||||||
- misspell
|
- misspell
|
||||||
|
- promlinter
|
||||||
- tagliatelle
|
- tagliatelle
|
||||||
- tenv
|
- tenv
|
||||||
- testpackage
|
- testpackage
|
||||||
|
|||||||
@@ -55,9 +55,6 @@ The global configurations are:
|
|||||||
|key|value|
|
|key|value|
|
||||||
|---|---|
|
|---|---|
|
||||||
|`check_interval`|Maximum frequency to run checks for each monitor as duration, eg. 1m2s.|
|
|`check_interval`|Maximum frequency to run checks for each monitor as duration, eg. 1m2s.|
|
||||||
|`default_alert_after`|A default value used as an `alert_after` value for a monitor if not specified or 0.|
|
|
||||||
|`default_alert_down`|Default down alerts to be used by a monitor in case none are provided.|
|
|
||||||
|`default_alert_up`|Default up alerts to be used by a monitor in case none are provided.|
|
|
||||||
|`monitors`|List of all monitors. Detailed description below|
|
|`monitors`|List of all monitors. Detailed description below|
|
||||||
|`alerts`|List of all alerts. Detailed description below|
|
|`alerts`|List of all alerts. Detailed description below|
|
||||||
|
|
||||||
|
|||||||
@@ -13,13 +13,9 @@ var errInvalidConfig = errors.New("Invalid configuration")
|
|||||||
|
|
||||||
// Config type contains all provided user configuration
|
// Config type contains all provided user configuration
|
||||||
type Config struct {
|
type Config struct {
|
||||||
CheckInterval SecondsOrDuration `yaml:"check_interval"`
|
CheckInterval SecondsOrDuration `yaml:"check_interval"`
|
||||||
DefaultAlertAfter int16 `yaml:"default_alert_after"`
|
Monitors []*Monitor
|
||||||
DefaultAlertEvery *int16 `yaml:"default_alert_every"`
|
Alerts map[string]*Alert
|
||||||
DefaultAlertDown []string `yaml:"default_alert_down"`
|
|
||||||
DefaultAlertUp []string `yaml:"default_alert_up"`
|
|
||||||
Monitors []*Monitor
|
|
||||||
Alerts map[string]*Alert
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// CommandOrShell type wraps a string or list of strings
|
// CommandOrShell type wraps a string or list of strings
|
||||||
@@ -139,27 +135,8 @@ func (config Config) IsValid() (isValid bool) {
|
|||||||
|
|
||||||
// Init performs extra initialization on top of loading the config from file
|
// Init performs extra initialization on top of loading the config from file
|
||||||
func (config *Config) Init() (err error) {
|
func (config *Config) Init() (err error) {
|
||||||
for _, monitor := range config.Monitors {
|
|
||||||
if monitor.AlertAfter == 0 && config.DefaultAlertAfter > 0 {
|
|
||||||
monitor.AlertAfter = config.DefaultAlertAfter
|
|
||||||
}
|
|
||||||
|
|
||||||
if monitor.AlertEvery == nil && config.DefaultAlertEvery != nil {
|
|
||||||
monitor.AlertEvery = config.DefaultAlertEvery
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(monitor.AlertDown) == 0 && len(config.DefaultAlertDown) > 0 {
|
|
||||||
monitor.AlertDown = config.DefaultAlertDown
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(monitor.AlertUp) == 0 && len(config.DefaultAlertUp) > 0 {
|
|
||||||
monitor.AlertUp = config.DefaultAlertUp
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for name, alert := range config.Alerts {
|
for name, alert := range config.Alerts {
|
||||||
alert.Name = name
|
alert.Name = name
|
||||||
|
|
||||||
if err = alert.BuildTemplates(); err != nil {
|
if err = alert.BuildTemplates(); err != nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -14,7 +14,6 @@ func TestLoadConfig(t *testing.T) {
|
|||||||
pyCompat bool
|
pyCompat bool
|
||||||
}{
|
}{
|
||||||
{"./test/valid-config.yml", false, "Valid config file", false},
|
{"./test/valid-config.yml", false, "Valid config file", false},
|
||||||
{"./test/valid-config-default-values.yml", false, "Valid config file with default values", false},
|
|
||||||
{"./test/valid-default-log-alert.yml", false, "Valid config file with default log alert PyCompat", true},
|
{"./test/valid-default-log-alert.yml", false, "Valid config file with default log alert PyCompat", true},
|
||||||
{"./test/valid-default-log-alert.yml", true, "Invalid config file no log alert", false},
|
{"./test/valid-default-log-alert.yml", true, "Invalid config file no log alert", false},
|
||||||
{"./test/does-not-exist", true, "Invalid config path", false},
|
{"./test/does-not-exist", true, "Invalid config path", false},
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ func checkMonitors(config *Config) error {
|
|||||||
|
|
||||||
// Track status metrics
|
// Track status metrics
|
||||||
Metrics.SetMonitorStatus(monitor.Name, monitor.IsUp())
|
Metrics.SetMonitorStatus(monitor.Name, monitor.IsUp())
|
||||||
Metrics.CountCheck(monitor.Name, success, monitor.LastCheckMilliseconds(), hasAlert)
|
Metrics.CountCheck(monitor.Name, success, monitor.LastCheckSeconds(), hasAlert)
|
||||||
|
|
||||||
if alertNotice != nil {
|
if alertNotice != nil {
|
||||||
err := sendAlerts(config, monitor, alertNotice)
|
err := sendAlerts(config, monitor, alertNotice)
|
||||||
|
|||||||
+4
-4
@@ -43,14 +43,14 @@ func NewMetrics() *MinitorMetrics {
|
|||||||
),
|
),
|
||||||
checkTime: prometheus.NewGaugeVec(
|
checkTime: prometheus.NewGaugeVec(
|
||||||
prometheus.GaugeOpts{
|
prometheus.GaugeOpts{
|
||||||
Name: "minitor_check_milliseconds",
|
Name: "minitor_check_seconds",
|
||||||
Help: "Time in miliseconds that a check ran for",
|
Help: "Time in miliseconds that a check ran for",
|
||||||
},
|
},
|
||||||
[]string{"monitor", "status"},
|
[]string{"monitor", "status"},
|
||||||
),
|
),
|
||||||
monitorStatus: prometheus.NewGaugeVec(
|
monitorStatus: prometheus.NewGaugeVec(
|
||||||
prometheus.GaugeOpts{
|
prometheus.GaugeOpts{
|
||||||
Name: "minitor_monitor_up_count",
|
Name: "minitor_monitor_up",
|
||||||
Help: "Status of currently responsive monitors",
|
Help: "Status of currently responsive monitors",
|
||||||
},
|
},
|
||||||
[]string{"monitor"},
|
[]string{"monitor"},
|
||||||
@@ -77,7 +77,7 @@ func (metrics *MinitorMetrics) SetMonitorStatus(monitor string, isUp bool) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// CountCheck counts the result of a particular Monitor check
|
// CountCheck counts the result of a particular Monitor check
|
||||||
func (metrics *MinitorMetrics) CountCheck(monitor string, isSuccess bool, ms int64, isAlert bool) {
|
func (metrics *MinitorMetrics) CountCheck(monitor string, isSuccess bool, secs float64, isAlert bool) {
|
||||||
status := "failure"
|
status := "failure"
|
||||||
if isSuccess {
|
if isSuccess {
|
||||||
status = "success"
|
status = "success"
|
||||||
@@ -94,7 +94,7 @@ func (metrics *MinitorMetrics) CountCheck(monitor string, isSuccess bool, ms int
|
|||||||
|
|
||||||
metrics.checkTime.With(
|
metrics.checkTime.With(
|
||||||
prometheus.Labels{"monitor": monitor, "status": status},
|
prometheus.Labels{"monitor": monitor, "status": status},
|
||||||
).Set(float64(ms))
|
).Set(secs)
|
||||||
}
|
}
|
||||||
|
|
||||||
// CountAlert counts an alert
|
// CountAlert counts an alert
|
||||||
|
|||||||
+10
-10
@@ -12,7 +12,7 @@ import (
|
|||||||
type Monitor struct { //nolint:maligned
|
type Monitor struct { //nolint:maligned
|
||||||
// Config values
|
// Config values
|
||||||
AlertAfter int16 `yaml:"alert_after"`
|
AlertAfter int16 `yaml:"alert_after"`
|
||||||
AlertEvery *int16 `yaml:"alert_every"`
|
AlertEvery int16 `yaml:"alert_every"`
|
||||||
CheckInterval SecondsOrDuration `yaml:"check_interval"`
|
CheckInterval SecondsOrDuration `yaml:"check_interval"`
|
||||||
Name string
|
Name string
|
||||||
AlertDown []string `yaml:"alert_down"`
|
AlertDown []string `yaml:"alert_down"`
|
||||||
@@ -91,9 +91,9 @@ func (monitor Monitor) IsUp() bool {
|
|||||||
return monitor.alertCount == 0
|
return monitor.alertCount == 0
|
||||||
}
|
}
|
||||||
|
|
||||||
// LastCheckMilliseconds gives number of milliseconds the last check ran for
|
// LastCheckSeconds gives number of seconds the last check ran for
|
||||||
func (monitor Monitor) LastCheckMilliseconds() int64 {
|
func (monitor Monitor) LastCheckSeconds() float64 {
|
||||||
return monitor.lastCheckDuration.Milliseconds()
|
return monitor.lastCheckDuration.Seconds()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (monitor *Monitor) success() (notice *AlertNotice) {
|
func (monitor *Monitor) success() (notice *AlertNotice) {
|
||||||
@@ -129,14 +129,14 @@ func (monitor *Monitor) failure() (notice *AlertNotice) {
|
|||||||
|
|
||||||
// Use alert cadence to determine if we should alert
|
// Use alert cadence to determine if we should alert
|
||||||
switch {
|
switch {
|
||||||
case monitor.AlertEvery == nil, *monitor.AlertEvery == 0:
|
case monitor.AlertEvery > 0:
|
||||||
// Handle alerting on first failure only
|
// Handle integer number of failures before alerting
|
||||||
if failureCount == 0 {
|
if failureCount%monitor.AlertEvery == 0 {
|
||||||
notice = monitor.createAlertNotice(false)
|
notice = monitor.createAlertNotice(false)
|
||||||
}
|
}
|
||||||
case *monitor.AlertEvery > 0:
|
case monitor.AlertEvery == 0:
|
||||||
// Handle integer number of failures before alerting
|
// Handle alerting on first failure only
|
||||||
if failureCount%*monitor.AlertEvery == 0 {
|
if failureCount == 0 {
|
||||||
notice = monitor.createAlertNotice(false)
|
notice = monitor.createAlertNotice(false)
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
|
|||||||
+17
-26
@@ -141,19 +141,17 @@ func TestMonitorSuccess(t *testing.T) {
|
|||||||
// TestMonitorFailureAlertAfter tests that alerts will not trigger until
|
// TestMonitorFailureAlertAfter tests that alerts will not trigger until
|
||||||
// hitting the threshold provided by AlertAfter
|
// hitting the threshold provided by AlertAfter
|
||||||
func TestMonitorFailureAlertAfter(t *testing.T) {
|
func TestMonitorFailureAlertAfter(t *testing.T) {
|
||||||
var alertEvery int16 = 1
|
|
||||||
|
|
||||||
cases := []struct {
|
cases := []struct {
|
||||||
monitor Monitor
|
monitor Monitor
|
||||||
expectNotice bool
|
expectNotice bool
|
||||||
name string
|
name string
|
||||||
}{
|
}{
|
||||||
{Monitor{AlertAfter: 1}, true, "Empty"}, // Defaults to true because AlertEvery defaults to nil
|
{Monitor{AlertAfter: 1}, true, "Empty"}, // Defaults to true because AlertEvery defaults to 0
|
||||||
{Monitor{failureCount: 0, AlertAfter: 1, AlertEvery: &alertEvery}, true, "Alert after 1: first failure"},
|
{Monitor{failureCount: 0, AlertAfter: 1, AlertEvery: 1}, true, "Alert after 1: first failure"},
|
||||||
{Monitor{failureCount: 1, AlertAfter: 1, AlertEvery: &alertEvery}, true, "Alert after 1: second failure"},
|
{Monitor{failureCount: 1, AlertAfter: 1, AlertEvery: 1}, true, "Alert after 1: second failure"},
|
||||||
{Monitor{failureCount: 0, AlertAfter: 20, AlertEvery: &alertEvery}, false, "Alert after 20: first failure"},
|
{Monitor{failureCount: 0, AlertAfter: 20, AlertEvery: 1}, false, "Alert after 20: first failure"},
|
||||||
{Monitor{failureCount: 19, AlertAfter: 20, AlertEvery: &alertEvery}, true, "Alert after 20: 20th failure"},
|
{Monitor{failureCount: 19, AlertAfter: 20, AlertEvery: 1}, true, "Alert after 20: 20th failure"},
|
||||||
{Monitor{failureCount: 20, AlertAfter: 20, AlertEvery: &alertEvery}, true, "Alert after 20: 21st failure"},
|
{Monitor{failureCount: 20, AlertAfter: 20, AlertEvery: 1}, true, "Alert after 20: 21st failure"},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, c := range cases {
|
for _, c := range cases {
|
||||||
@@ -174,11 +172,6 @@ func TestMonitorFailureAlertAfter(t *testing.T) {
|
|||||||
// TestMonitorFailureAlertEvery tests that alerts will trigger
|
// TestMonitorFailureAlertEvery tests that alerts will trigger
|
||||||
// on the expected intervals
|
// on the expected intervals
|
||||||
func TestMonitorFailureAlertEvery(t *testing.T) {
|
func TestMonitorFailureAlertEvery(t *testing.T) {
|
||||||
var alertEvery0, alertEvery1, alertEvery2 int16
|
|
||||||
alertEvery0 = 0
|
|
||||||
alertEvery1 = 1
|
|
||||||
alertEvery2 = 2
|
|
||||||
|
|
||||||
cases := []struct {
|
cases := []struct {
|
||||||
monitor Monitor
|
monitor Monitor
|
||||||
expectNotice bool
|
expectNotice bool
|
||||||
@@ -193,20 +186,20 @@ func TestMonitorFailureAlertEvery(t *testing.T) {
|
|||||||
|
|
||||||
For usability, this should be consistent. Consistent with what though? minitor-py? Or itself? Dun dun duuuunnnnn!
|
For usability, this should be consistent. Consistent with what though? minitor-py? Or itself? Dun dun duuuunnnnn!
|
||||||
*/
|
*/
|
||||||
{Monitor{AlertAfter: 1}, true, "Empty"}, // Defaults to true because AlertAfter and AlertEvery default to nil
|
{Monitor{AlertAfter: 1}, true, "Empty"}, // Defaults to true because AlertAfter and AlertEvery default to 0
|
||||||
// Alert first time only, after 1
|
// Alert first time only, after 1
|
||||||
{Monitor{failureCount: 0, AlertAfter: 1, AlertEvery: &alertEvery0}, true, "Alert first time only after 1: first failure"},
|
{Monitor{failureCount: 0, AlertAfter: 1, AlertEvery: 0}, true, "Alert first time only after 1: first failure"},
|
||||||
{Monitor{failureCount: 1, AlertAfter: 1, AlertEvery: &alertEvery0}, false, "Alert first time only after 1: second failure"},
|
{Monitor{failureCount: 1, AlertAfter: 1, AlertEvery: 0}, false, "Alert first time only after 1: second failure"},
|
||||||
{Monitor{failureCount: 2, AlertAfter: 1, AlertEvery: &alertEvery0}, false, "Alert first time only after 1: third failure"},
|
{Monitor{failureCount: 2, AlertAfter: 1, AlertEvery: 0}, false, "Alert first time only after 1: third failure"},
|
||||||
// Alert every time, after 1
|
// Alert every time, after 1
|
||||||
{Monitor{failureCount: 0, AlertAfter: 1, AlertEvery: &alertEvery1}, true, "Alert every time after 1: first failure"},
|
{Monitor{failureCount: 0, AlertAfter: 1, AlertEvery: 1}, true, "Alert every time after 1: first failure"},
|
||||||
{Monitor{failureCount: 1, AlertAfter: 1, AlertEvery: &alertEvery1}, true, "Alert every time after 1: second failure"},
|
{Monitor{failureCount: 1, AlertAfter: 1, AlertEvery: 1}, true, "Alert every time after 1: second failure"},
|
||||||
{Monitor{failureCount: 2, AlertAfter: 1, AlertEvery: &alertEvery1}, true, "Alert every time after 1: third failure"},
|
{Monitor{failureCount: 1, AlertAfter: 1, AlertEvery: 1}, true, "Alert every time after 1: third failure"},
|
||||||
// Alert every other time, after 1
|
// Alert every other time, after 1
|
||||||
{Monitor{failureCount: 0, AlertAfter: 1, AlertEvery: &alertEvery2}, true, "Alert every other time after 1: first failure"},
|
{Monitor{failureCount: 0, AlertAfter: 1, AlertEvery: 2}, true, "Alert every other time after 1: first failure"},
|
||||||
{Monitor{failureCount: 1, AlertAfter: 1, AlertEvery: &alertEvery2}, false, "Alert every other time after 1: second failure"},
|
{Monitor{failureCount: 1, AlertAfter: 1, AlertEvery: 2}, false, "Alert every other time after 1: second failure"},
|
||||||
{Monitor{failureCount: 2, AlertAfter: 1, AlertEvery: &alertEvery2}, true, "Alert every other time after 1: third failure"},
|
{Monitor{failureCount: 2, AlertAfter: 1, AlertEvery: 2}, true, "Alert every other time after 1: third failure"},
|
||||||
{Monitor{failureCount: 3, AlertAfter: 1, AlertEvery: &alertEvery2}, false, "Alert every other time after 1: fourth failure"},
|
{Monitor{failureCount: 3, AlertAfter: 1, AlertEvery: 2}, false, "Alert every other time after 1: fourth failure"},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, c := range cases {
|
for _, c := range cases {
|
||||||
@@ -227,8 +220,6 @@ func TestMonitorFailureAlertEvery(t *testing.T) {
|
|||||||
// TestMonitorFailureExponential tests that alerts will trigger
|
// TestMonitorFailureExponential tests that alerts will trigger
|
||||||
// with an exponential backoff after repeated failures
|
// with an exponential backoff after repeated failures
|
||||||
func TestMonitorFailureExponential(t *testing.T) {
|
func TestMonitorFailureExponential(t *testing.T) {
|
||||||
var alertEveryExp int16 = -1
|
|
||||||
|
|
||||||
cases := []struct {
|
cases := []struct {
|
||||||
expectNotice bool
|
expectNotice bool
|
||||||
name string
|
name string
|
||||||
@@ -245,7 +236,7 @@ func TestMonitorFailureExponential(t *testing.T) {
|
|||||||
|
|
||||||
// Unlike previous tests, this one requires a static Monitor with repeated
|
// Unlike previous tests, this one requires a static Monitor with repeated
|
||||||
// calls to the failure method
|
// calls to the failure method
|
||||||
monitor := Monitor{failureCount: 0, AlertAfter: 1, AlertEvery: &alertEveryExp}
|
monitor := Monitor{failureCount: 0, AlertAfter: 1, AlertEvery: -1}
|
||||||
|
|
||||||
for _, c := range cases {
|
for _, c := range cases {
|
||||||
log.Printf("Testing case %s", c.name)
|
log.Printf("Testing case %s", c.name)
|
||||||
|
|||||||
@@ -1,12 +0,0 @@
|
|||||||
---
|
|
||||||
check_interval: 1
|
|
||||||
default_alert_down: ["log_command"]
|
|
||||||
default_alert_after: 1
|
|
||||||
|
|
||||||
monitors:
|
|
||||||
- name: Command
|
|
||||||
command: ["echo", "$PATH"]
|
|
||||||
|
|
||||||
alerts:
|
|
||||||
log_command:
|
|
||||||
command: ["echo", "regular", '"command!!!"', "{{.MonitorName}}"]
|
|
||||||
Reference in New Issue
Block a user