Compare commits

..

14 Commits

Author SHA1 Message Date
Ian Fijolek
49e3635819 Add backwards compatility explanation in Readme 2021-05-12 16:37:59 -07:00
Ian Fijolek
860c2cdf43 Add custom type to parse out seconds as int and durations as strings 2021-05-12 10:33:42 -07:00
Ian Fijolek
04395fa693 Add duration parsing tests 2021-05-11 10:41:39 -07:00
Ian Fijolek
bdf7355fa7 Add duration parsing for intervals 2021-05-11 10:41:39 -07:00
Ian Fijolek
befea7375f Add check runtime metric 2021-05-11 10:41:39 -07:00
Ian Fijolek
30c2c7d6b2 Add Dockerfile linting back in 2021-05-10 21:53:26 -07:00
Ian Fijolek
5f250f17a8 Add more liniting and update to pass 2021-05-10 21:53:26 -07:00
Ian Fijolek
fda9e1bfc3 Replace log with slog 2021-05-10 21:53:26 -07:00
Ian Fijolek
f0e179851f Update linting and a test case 2021-01-08 18:31:22 -05:00
Ian Fijolek
9e124803da Add release uploads 2021-01-08 18:13:48 -05:00
Ian Fijolek
2c4543a7bc Update go version to 1.15 2021-01-08 18:13:34 -05:00
Ian Fijolek
a1b906b94a Update for go 1.15 2020-11-16 15:56:31 -08:00
Ian Fijolek
0a5be250b5 Scripts: Add echoing log lines to helper scripts
Rather than only returning the status of whether or not a container is
healhthy, the helper scripts will now optionally echo some of the latest
log lines.
2020-11-16 15:52:21 -08:00
Ian Fijolek
88f77aa27c Fix Makefile comment 2020-11-16 15:51:41 -08:00
22 changed files with 506 additions and 201 deletions
+45 -27
View File
@@ -3,33 +3,15 @@ kind: pipeline
name: test
steps:
- name: test
image: golang:1.12
image: golang:1.15
environment:
VERSION: ${DRONE_TAG:-${DRONE_COMMIT}}
commands:
- make build
- make test
- name: check
image: python:3
commands:
- pip install pre-commit==1.20.0
- make check
- name: notify
image: drillster/drone-email
settings:
host:
from_secret: SMTP_HOST
username:
from_secret: SMTP_USER
password:
from_secret: SMTP_PASS
from: drone@iamthefij.com
when:
status: [changed, failure]
image: iamthefij/drone-pre-commit:personal
---
kind: pipeline
@@ -48,11 +30,35 @@ trigger:
steps:
- name: build all binaries
image: golang:1.12
image: golang:1.15
environment:
VERSION: ${DRONE_TAG:-${DRONE_COMMIT}}
commands:
- make all-linux
- make all
- name: compress binaries for release
image: ubuntu
commands:
- find ./dist -type f -executable -execdir tar -czvf {}.tar.gz {} \;
when:
event: tag
- name: upload gitea release
image: plugins/gitea-release
settings:
title: ${DRONE_TAG}
files: dist/*.tar.gz
checksum:
- md5
- sha1
- sha256
- sha512
base_url:
from_secret: gitea_base_url
api_key:
from_secret: gitea_token
when:
event: tag
- name: push image - arm
image: plugins/docker
@@ -104,15 +110,27 @@ steps:
password:
from_secret: docker_password
---
kind: pipeline
name: notify
depends_on:
- test
- publish
trigger:
status:
- failure
steps:
- name: notify
image: drillster/drone-email
settings:
host:
from_secret: SMTP_HOST
from_secret: SMTP_HOST # pragma: whitelist secret
username:
from_secret: SMTP_USER
from_secret: SMTP_USER # pragma: whitelist secret
password:
from_secret: SMTP_PASS
from_secret: SMTP_PASS # pragma: whitelist secret
from: drone@iamthefij.com
when:
status: [changed, failure]
Vendored
+1
View File
@@ -17,4 +17,5 @@ config.yml
# Output binary
minitor
minitor-go
dist/
+48
View File
@@ -0,0 +1,48 @@
---
linters:
enable:
- asciicheck
- bodyclose
- dogsled
- dupl
- exhaustive
- gochecknoinits
- gocognit
- gocritic
- gocyclo
- goerr113
- gofumpt
- goimports
- gomnd
- goprintffuncname
# - gosec
# - ifshort
- interfacer
- maligned
- misspell
- nakedret
- nestif
- nlreturn
- noctx
- unparam
- wsl
# - errorlint
disable:
- gochecknoglobals
linters-settings:
gosec:
excludes:
- G204
# gomnd:
# settings:
# mnd:
# ignored-functions: math.*
issues:
exclude-rules:
- path: _test\.go
linters:
- errcheck
- gosec
- maligned
+7 -9
View File
@@ -1,7 +1,7 @@
---
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.4.0
rev: v3.4.0
hooks:
- id: check-added-large-files
- id: check-yaml
@@ -11,14 +11,12 @@ repos:
- id: end-of-file-fixer
- id: check-merge-conflict
- repo: git://github.com/dnephin/pre-commit-golang
rev: v0.3.5
rev: v0.4.0
hooks:
- id: go-fmt
- id: go-imports
# - id: gometalinter
# - id: golangci-lint
# - repo: https://github.com/IamTheFij/docker-pre-commit
# rev: v2.0.0
# hooks:
# - id: docker-compose-check
# - id: hadolint
- id: golangci-lint
- repo: https://github.com/hadolint/hadolint
rev: v2.4.0
hooks:
- id: hadolint
+1 -1
View File
@@ -46,7 +46,7 @@ test:
install-hooks:
pre-commit install --install-hooks
# Checks files for encryption
# Runs pre-commit checks on files
.PHONY: check
check:
pre-commit run --all-files
+14 -4
View File
@@ -54,7 +54,7 @@ The global configurations are:
|key|value|
|---|---|
|`check_interval`|Maximum frequency to run checks for each monitor|
|`check_interval`|Maximum frequency to run checks for each monitor, as a duration, e.g. 1m2s.|
|`monitors`|List of all monitors. Detailed description below|
|`alerts`|List of all alerts. Detailed description below|
@@ -111,7 +111,7 @@ minitor -metrics -metrics-port 3000
## Contributing
Whether you're looking to submit a patch or just tell me I broke something, you can contribute through the Github mirror and I can merge PRs back to the source repository.
Whether you're looking to submit a patch or tell me I broke something, you can contribute through the Github mirror and I can merge PRs back to the source repository.
Primary Repo: https://git.iamthefij.com/iamthefij/minitor.git
@@ -143,15 +143,25 @@ alerts:
command: 'echo {{.MonitorName}}'
```
Interval durations have changed from being an integer number of seconds to a duration string supported by Go, for example:
minitor-py:
```yaml
check_interval: 90
```
minitor-go:
```yaml
check_interval: 1m30s
```
For the time being, legacy configs for the Python version of Minitor should be compatible if you apply the `-py-compat` flag when running Minitor. Eventually, this flag will go away when later breaking changes are introduced.
## Future
Future, potentially breaking changes
- [ ] Implement leveled logging (maybe glog or logrus)
- [ ] Consider value of templating vs injecting values into Env variables
- [ ] Async checking
- [ ] Revisit metrics and see if they all make sense
- [ ] Consider dropping `alert_up` and `alert_down` in favor of using Go templates that offer more control of messaging (Breaking)
- [ ] Use durations rather than seconds checked in event loop (Potentially breaking)
+49 -19
View File
@@ -2,12 +2,21 @@ package main
import (
"bytes"
"errors"
"fmt"
"log"
"os/exec"
"strings"
"text/template"
"time"
"git.iamthefij.com/iamthefij/slog"
)
var (
// errNoTemplate is the sentinel wrapped into the errors returned when an
// alert has no command template to build or to execute
errNoTemplate = errors.New("no template")
// ErrAlertFailed indicates that an alert failed to send
ErrAlertFailed = errors.New("alert failed")
)
// Alert is a config driven mechanism for sending a notice
@@ -20,12 +29,12 @@ type Alert struct {
// AlertNotice captures the context for an alert to be sent
type AlertNotice struct {
MonitorName string
AlertCount int16
FailureCount int16
LastCheckOutput string
LastSuccess time.Time
IsUp bool
LastSuccess time.Time
MonitorName string
LastCheckOutput string
}
// IsValid returns a boolean indicating if the Alert has been correctly
@@ -45,29 +54,33 @@ func (alert *Alert) BuildTemplates() error {
"{last_success}", "{{.LastSuccess}}",
"{monitor_name}", "{{.MonitorName}}",
)
if LogDebug {
log.Printf("DEBUG: Building template for alert %s", alert.Name)
}
if alert.commandTemplate == nil && alert.Command.Command != nil {
slog.Debugf("Building template for alert %s", alert.Name)
switch {
case alert.commandTemplate == nil && alert.Command.Command != nil:
alert.commandTemplate = []*template.Template{}
for i, cmdPart := range alert.Command.Command {
if PyCompat {
cmdPart = legacy.Replace(cmdPart)
}
alert.commandTemplate = append(alert.commandTemplate, template.Must(
template.New(alert.Name+string(i)).Parse(cmdPart),
template.New(alert.Name+fmt.Sprint(i)).Parse(cmdPart),
))
}
} else if alert.commandShellTemplate == nil && alert.Command.ShellCommand != "" {
case alert.commandShellTemplate == nil && alert.Command.ShellCommand != "":
shellCmd := alert.Command.ShellCommand
if PyCompat {
shellCmd = legacy.Replace(shellCmd)
}
alert.commandShellTemplate = template.Must(
template.New(alert.Name).Parse(shellCmd),
)
} else {
return fmt.Errorf("No template provided for alert %s", alert.Name)
default:
return fmt.Errorf("No template provided for alert %s: %w", alert.Name, errNoTemplate)
}
return nil
@@ -75,30 +88,40 @@ func (alert *Alert) BuildTemplates() error {
// Send will send an alert notice by executing the command template
func (alert Alert) Send(notice AlertNotice) (outputStr string, err error) {
log.Printf("INFO: Sending alert %s for %s", alert.Name, notice.MonitorName)
slog.Infof("Sending alert %s for %s", alert.Name, notice.MonitorName)
var cmd *exec.Cmd
if alert.commandTemplate != nil {
switch {
case alert.commandTemplate != nil:
command := []string{}
for _, cmdTmp := range alert.commandTemplate {
var commandBuffer bytes.Buffer
err = cmdTmp.Execute(&commandBuffer, notice)
if err != nil {
return
}
command = append(command, commandBuffer.String())
}
cmd = exec.Command(command[0], command[1:]...)
} else if alert.commandShellTemplate != nil {
case alert.commandShellTemplate != nil:
var commandBuffer bytes.Buffer
err = alert.commandShellTemplate.Execute(&commandBuffer, notice)
if err != nil {
return
}
shellCommand := commandBuffer.String()
cmd = ShellCommand(shellCommand)
} else {
err = fmt.Errorf("No templates compiled for alert %v", alert.Name)
default:
err = fmt.Errorf("No templates compiled for alert %s: %w", alert.Name, errNoTemplate)
return
}
@@ -110,8 +133,15 @@ func (alert Alert) Send(notice AlertNotice) (outputStr string, err error) {
var output []byte
output, err = cmd.CombinedOutput()
outputStr = string(output)
if LogDebug {
log.Printf("DEBUG: Alert output for: %s\n---\n%s\n---", alert.Name, outputStr)
slog.Debugf("Alert output for: %s\n---\n%s\n---", alert.Name, outputStr)
if err != nil {
err = fmt.Errorf(
"Alert '%s' failed to send. Returned %v: %w",
alert.Name,
err,
ErrAlertFailed,
)
}
return outputStr, err
+16 -1
View File
@@ -18,11 +18,13 @@ func TestAlertIsValid(t *testing.T) {
for _, c := range cases {
log.Printf("Testing case %s", c.name)
actual := c.alert.IsValid()
if actual != c.expected {
t.Errorf("IsValid(%v), expected=%t actual=%t", c.name, c.expected, actual)
log.Printf("Case failed: %s", c.name)
}
log.Println("-----")
}
}
@@ -100,19 +102,28 @@ func TestAlertSend(t *testing.T) {
log.Printf("Testing case %s", c.name)
// Set PyCompat to value of compat flag
PyCompat = c.pyCompat
c.alert.BuildTemplates()
err := c.alert.BuildTemplates()
if err != nil {
t.Errorf("Send(%v output), error building templates: %v", c.name, err)
}
output, err := c.alert.Send(c.notice)
hasErr := (err != nil)
if output != c.expectedOutput {
t.Errorf("Send(%v output), expected=%v actual=%v", c.name, c.expectedOutput, output)
log.Printf("Case failed: %s", c.name)
}
if hasErr != c.expectErr {
t.Errorf("Send(%v err), expected=%v actual=%v", c.name, "Err", err)
log.Printf("Case failed: %s", c.name)
}
// Set PyCompat back to default value
PyCompat = false
log.Println("-----")
}
}
@@ -120,10 +131,12 @@ func TestAlertSend(t *testing.T) {
// TestAlertSendNoTemplates checks that sending through a zero-value Alert,
// which has no compiled templates, reports an error.
func TestAlertSendNoTemplates(t *testing.T) {
	var (
		emptyAlert  Alert
		emptyNotice AlertNotice
	)

	if out, sendErr := emptyAlert.Send(emptyNotice); sendErr == nil {
		t.Errorf("Send(no template), expected=%v actual=%v", "Err", out)
	}

	log.Println("-----")
}
@@ -142,10 +155,12 @@ func TestAlertBuildTemplate(t *testing.T) {
log.Printf("Testing case %s", c.name)
err := c.alert.BuildTemplates()
hasErr := (err != nil)
if hasErr != c.expectErr {
t.Errorf("IsValid(%v), expected=%t actual=%t", c.name, c.expectErr, err)
log.Printf("Case failed: %s", c.name)
}
log.Println("-----")
}
}
+63 -18
View File
@@ -3,14 +3,17 @@ package main
import (
"errors"
"io/ioutil"
"log"
"time"
"git.iamthefij.com/iamthefij/slog"
"gopkg.in/yaml.v2"
)
var errInvalidConfig = errors.New("Invalid configuration")
// Config contains all user-provided configuration
type Config struct {
CheckInterval int64 `yaml:"check_interval"`
CheckInterval SecondsOrDuration `yaml:"check_interval"`
Monitors []*Monitor
Alerts map[string]*Alert
}
@@ -35,17 +38,48 @@ func (cos *CommandOrShell) UnmarshalYAML(unmarshal func(interface{}) error) erro
// Error indicates this is shell command
if err != nil {
var shellCmd string
err := unmarshal(&shellCmd)
if err != nil {
return err
}
cos.ShellCommand = shellCmd
} else {
cos.Command = cmd
}
return nil
}
// SecondsOrDuration holds an interval that YAML may express either as an
// integer number of seconds or as a duration.
// NOTE: Drop this wrapper and parse plain durations once compatibility is broken
type SecondsOrDuration struct {
	value time.Duration
}

// Value exposes the wrapped interval as a time.Duration
func (sod SecondsOrDuration) Value() time.Duration {
	return sod.value
}

// UnmarshalYAML first tries to decode the node as an int64 count of seconds.
// When that attempt fails it decodes straight into the wrapped duration and
// returns whatever that second attempt reports.
func (sod *SecondsOrDuration) UnmarshalYAML(unmarshal func(interface{}) error) error {
	var legacySeconds int64

	if intErr := unmarshal(&legacySeconds); intErr != nil {
		// Not an int, so let the decoder fill the duration itself
		return unmarshal(&sod.value)
	}

	sod.value = time.Duration(legacySeconds) * time.Second

	return nil
}
// IsValid checks config validity and returns true if valid
func (config Config) IsValid() (isValid bool) {
isValid = true
@@ -53,41 +87,50 @@ func (config Config) IsValid() (isValid bool) {
// Validate alerts
if config.Alerts == nil || len(config.Alerts) == 0 {
// This should never happen because there is a default alert named 'log' for now
log.Printf("ERROR: Invalid alert configuration: Must provide at least one alert")
slog.Errorf("Invalid alert configuration: Must provide at least one alert")
isValid = false
}
for _, alert := range config.Alerts {
if !alert.IsValid() {
log.Printf("ERROR: Invalid alert configuration: %s", alert.Name)
slog.Errorf("Invalid alert configuration: %+v", alert.Name)
isValid = false
} else {
slog.Debugf("Loaded alert %s", alert.Name)
}
}
// Validate monitors
if config.Monitors == nil || len(config.Monitors) == 0 {
log.Printf("ERROR: Invalid monitor configuration: Must provide at least one monitor")
slog.Errorf("Invalid monitor configuration: Must provide at least one monitor")
isValid = false
}
for _, monitor := range config.Monitors {
if !monitor.IsValid() {
log.Printf("ERROR: Invalid monitor configuration: %s", monitor.Name)
slog.Errorf("Invalid monitor configuration: %s", monitor.Name)
isValid = false
}
// Check that all Monitor alerts actually exist
for _, isUp := range []bool{true, false} {
for _, alertName := range monitor.GetAlertNames(isUp) {
if _, ok := config.Alerts[alertName]; !ok {
log.Printf(
"ERROR: Invalid monitor configuration: %s. Unknown alert %s",
slog.Errorf(
"Invalid monitor configuration: %s. Unknown alert %s",
monitor.Name, alertName,
)
isValid = false
}
}
}
}
return
return isValid
}
// Init performs extra initialization on top of loading the config from file
@@ -114,28 +157,30 @@ func LoadConfig(filePath string) (config Config, err error) {
return
}
if LogDebug {
log.Printf("DEBUG: Config values:\n%v\n", config)
}
slog.Debugf("Config values:\n%v\n", config)
// Add log alert if not present
if PyCompat {
// Intialize alerts list if not present
// Initialize alerts list if not present
if config.Alerts == nil {
config.Alerts = map[string]*Alert{}
}
if _, ok := config.Alerts["log"]; !ok {
config.Alerts["log"] = NewLogAlert()
}
}
if !config.IsValid() {
err = errors.New("Invalid configuration")
// Finish initializing configuration
if err = config.Init(); err != nil {
return
}
// Finish initializing configuration
err = config.Init()
if !config.IsValid() {
err = errInvalidConfig
return
return
}
return config, err
}
+41 -1
View File
@@ -3,6 +3,7 @@ package main
import (
"log"
"testing"
"time"
)
func TestLoadConfig(t *testing.T) {
@@ -27,20 +28,50 @@ func TestLoadConfig(t *testing.T) {
PyCompat = c.pyCompat
_, err := LoadConfig(c.configPath)
hasErr := (err != nil)
if hasErr != c.expectErr {
t.Errorf("LoadConfig(%v), expected_error=%v actual=%v", c.name, c.expectErr, err)
log.Printf("Case failed: %s", c.name)
}
// Set PyCompat to default value
PyCompat = false
log.Println("-----")
}
}
// TestIntervalParsing loads ./test/valid-config.yml and checks that the
// top-level check interval and the intervals of the first two monitors were
// parsed into the expected durations.
func TestIntervalParsing(t *testing.T) {
	log.Printf("Testing case TestIntervalParsing")

	config, err := LoadConfig("./test/valid-config.yml")
	if err != nil {
		// Nothing below is meaningful (and indexing Monitors would panic)
		// without a loaded config, so stop here.
		t.Fatalf("Failed loading config: %v", err)
	}

	const requiredMonitors = 2
	if len(config.Monitors) < requiredMonitors {
		t.Fatalf(
			"Expected at least %d monitors in config, found %d",
			requiredMonitors, len(config.Monitors),
		)
	}

	oneSecond := time.Second
	tenSeconds := 10 * time.Second
	oneMinute := time.Minute

	// validate top level interval seconds represented as an int
	if actual := config.CheckInterval.Value(); actual != oneSecond {
		t.Errorf("Incorrectly parsed int seconds. expected=%v actual=%v", oneSecond, actual)
	}

	// Each failure message reports the expectation and the value actually compared
	if actual := config.Monitors[0].CheckInterval.Value(); actual != tenSeconds {
		t.Errorf("Incorrectly parsed seconds duration. expected=%v actual=%v", tenSeconds, actual)
	}

	if actual := config.Monitors[1].CheckInterval.Value(); actual != oneMinute {
		t.Errorf("Incorrectly parsed seconds duration. expected=%v actual=%v", oneMinute, actual)
	}

	log.Println("-----")
}
// TestMultiLineConfig is a more complicated test stepping through the parsing
// and execution of multi-line strings presented in YAML
func TestMultiLineConfig(t *testing.T) {
log.Println("Testing multi-line string config")
config, err := LoadConfig("./test/valid-verify-multi-line.yml")
if err != nil {
t.Fatalf("TestMultiLineConfig(load), expected=no_error actual=%v", err)
@@ -48,8 +79,10 @@ func TestMultiLineConfig(t *testing.T) {
log.Println("-----")
log.Println("TestMultiLineConfig(parse > string)")
expected := "echo 'Some string with stuff'; echo \"<angle brackets>\"; exit 1\n"
actual := config.Monitors[0].Command.ShellCommand
if expected != actual {
t.Errorf("TestMultiLineConfig(>) failed")
t.Logf("string expected=`%v`", expected)
@@ -60,12 +93,15 @@ func TestMultiLineConfig(t *testing.T) {
log.Println("-----")
log.Println("TestMultiLineConfig(execute > string)")
_, notice := config.Monitors[0].Check()
if notice == nil {
t.Fatalf("Did not receive an alert notice")
}
expected = "Some string with stuff\n<angle brackets>\n"
actual = notice.LastCheckOutput
if expected != actual {
t.Errorf("TestMultiLineConfig(execute > string) check failed")
t.Logf("string expected=`%v`", expected)
@@ -76,8 +112,10 @@ func TestMultiLineConfig(t *testing.T) {
log.Println("-----")
log.Println("TestMultiLineConfig(parse | string)")
expected = "echo 'Some string with stuff'\necho '<angle brackets>'\n"
actual = config.Alerts["log_shell"].Command.ShellCommand
if expected != actual {
t.Errorf("TestMultiLineConfig(|) failed")
t.Logf("string expected=`%v`", expected)
@@ -88,10 +126,12 @@ func TestMultiLineConfig(t *testing.T) {
log.Println("-----")
log.Println("TestMultiLineConfig(execute | string)")
actual, err = config.Alerts["log_shell"].Send(AlertNotice{})
if err != nil {
t.Errorf("Execution of alert failed")
}
expected = "Some string with stuff\n<angle brackets>\n"
if expected != actual {
t.Errorf("TestMultiLineConfig(execute | string) check failed")
+2 -1
View File
@@ -1,8 +1,9 @@
module git.iamthefij.com/iamthefij/minitor-go
go 1.12
go 1.15
require (
git.iamthefij.com/iamthefij/slog v1.3.0
github.com/prometheus/client_golang v1.2.1
gopkg.in/yaml.v2 v2.2.4
)
+2
View File
@@ -1,3 +1,5 @@
git.iamthefij.com/iamthefij/slog v1.3.0 h1:4Hu5PQvDrW5e3FrTS3q2iIXW0iPvhNY/9qJsqDR3K3I=
git.iamthefij.com/iamthefij/slog v1.3.0/go.mod h1:1RUj4hcCompZkAxXCRfUX786tb3cM/Zpkn97dGfUfbg=
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
+56 -54
View File
@@ -1,16 +1,15 @@
package main
import (
"errors"
"flag"
"fmt"
"log"
"time"
"git.iamthefij.com/iamthefij/slog"
)
var (
// LogDebug will control whether debug messsages should be logged
LogDebug = false
// ExportMetrics will track whether or not we want to export metrics to prometheus
ExportMetrics = false
// MetricsPort is the port to expose metrics on
@@ -23,8 +22,49 @@ var (
// version of minitor being run
version = "dev"
errUnknownAlert = errors.New("unknown alert")
)
// sendAlerts delivers alertNotice through every alert that monitor lists for
// the notice's up/down state.
//
// Alert names come from monitor.GetAlertNames(alertNotice.IsUp) and are looked
// up in config.Alerts. Processing stops at the first problem: an error from
// alert.Send is logged and returned unchanged, and a name missing from
// config.Alerts yields an error wrapping errUnknownAlert. Every successful send
// is counted through Metrics.CountAlert. It returns nil when all alerts were
// sent, or when there were none to send.
func sendAlerts(config *Config, monitor *Monitor, alertNotice *AlertNotice) error {
	slog.Debugf("Received an alert notice from %s", alertNotice.MonitorName)

	alertNames := monitor.GetAlertNames(alertNotice.IsUp)
	if len(alertNames) == 0 {
		// Checking the length covers both a missing list (nil) and an
		// explicitly empty one.
		// This should only happen for a recovery alert. AlertDown is validated not empty
		slog.Warningf(
			"Received alert, but no alert mechanisms exist. MonitorName=%s IsUp=%t",
			alertNotice.MonitorName, alertNotice.IsUp,
		)
	}

	for _, alertName := range alertNames {
		alert, ok := config.Alerts[alertName]
		if !ok {
			// This case should never actually happen since we validate against it
			slog.Errorf("Unknown alert for monitor %s: %s", alertNotice.MonitorName, alertName)

			return fmt.Errorf("unknown alert for monitor %s: %s: %w", alertNotice.MonitorName, alertName, errUnknownAlert)
		}

		output, err := alert.Send(*alertNotice)
		if err != nil {
			slog.Errorf(
				"Alert '%s' failed. result=%v: output=%s",
				alert.Name,
				err,
				output,
			)

			return err
		}

		// Count alert metrics
		Metrics.CountAlert(monitor.Name, alert.Name)
	}

	return nil
}
func checkMonitors(config *Config) error {
for _, monitor := range config.Monitors {
if monitor.ShouldCheck() {
@@ -34,47 +74,10 @@ func checkMonitors(config *Config) error {
// Track status metrics
Metrics.SetMonitorStatus(monitor.Name, monitor.IsUp())
Metrics.CountCheck(monitor.Name, success, hasAlert)
Metrics.CountCheck(monitor.Name, success, monitor.LastCheckMilliseconds(), hasAlert)
// Should probably consider refactoring everything below here
if alertNotice != nil {
if LogDebug {
log.Printf("DEBUG: Recieved an alert notice from %s", alertNotice.MonitorName)
}
alertNames := monitor.GetAlertNames(alertNotice.IsUp)
if alertNames == nil {
// This should only happen for a recovery alert. AlertDown is validated not empty
log.Printf(
"WARNING: Recieved alert, but no alert mechanisms exist. MonitorName=%s IsUp=%t",
alertNotice.MonitorName, alertNotice.IsUp,
)
}
for _, alertName := range alertNames {
if alert, ok := config.Alerts[alertName]; ok {
output, err := alert.Send(*alertNotice)
if err != nil {
log.Printf(
"ERROR: Alert '%s' failed. result=%v: output=%s",
alert.Name,
err,
output,
)
return fmt.Errorf(
"Unsuccessfully triggered alert '%s'. "+
"Crashing to avoid false negatives: %v",
alert.Name,
err,
)
}
// Count alert metrics
Metrics.CountAlert(monitor.Name, alert.Name)
} else {
// This case should never actually happen since we validate against it
log.Printf("ERROR: Unknown alert for monitor %s: %s", alertNotice.MonitorName, alertName)
return fmt.Errorf("Unknown alert for monitor %s: %s", alertNotice.MonitorName, alertName)
}
}
return sendAlerts(config, monitor, alertNotice)
}
}
}
@@ -83,30 +86,30 @@ func checkMonitors(config *Config) error {
}
func main() {
// Get debug flag
flag.BoolVar(&LogDebug, "debug", false, "Enables debug logs (default: false)")
showVersion := flag.Bool("version", false, "Display the version of minitor and exit")
configPath := flag.String("config", "config.yml", "Alternate configuration path (default: config.yml)")
flag.BoolVar(&slog.DebugLevel, "debug", false, "Enables debug logs (default: false)")
flag.BoolVar(&ExportMetrics, "metrics", false, "Enables prometheus metrics exporting (default: false)")
flag.BoolVar(&PyCompat, "py-compat", false, "Enables support for legacy Python Minitor config. Will eventually be removed. (default: false)")
flag.IntVar(&MetricsPort, "metrics-port", 8080, "The port that Prometheus metrics should be exported on, if enabled. (default: 8080)")
var showVersion = flag.Bool("version", false, "Display the version of minitor and exit")
var configPath = flag.String("config", "config.yml", "Alternate configuration path (default: config.yml)")
flag.Parse()
// Print version if flag is provided
if *showVersion {
log.Println("Minitor version:", version)
fmt.Println("Minitor version:", version)
return
}
// Load configuration
config, err := LoadConfig(*configPath)
if err != nil {
log.Fatalf("Error loading config: %v", err)
}
slog.OnErrFatalf(err, "Error loading config: %v", err)
// Serve metrics exporter, if specified
if ExportMetrics {
log.Println("INFO: Exporting metrics to Prometheus")
slog.Infof("Exporting metrics to Prometheus on port %d", MetricsPort)
go ServeMetrics()
}
@@ -117,7 +120,6 @@ func main() {
panic(err)
}
sleepTime := time.Duration(config.CheckInterval) * time.Second
time.Sleep(sleepTime)
time.Sleep(config.CheckInterval.Value())
}
}
+33 -11
View File
@@ -33,16 +33,10 @@ func TestCheckMonitors(t *testing.T) {
Command: CommandOrShell{Command: []string{"false"}},
AlertAfter: 1,
},
&Monitor{
Name: "Failure",
Command: CommandOrShell{Command: []string{"false"}},
AlertDown: []string{"unknown"},
AlertAfter: 1,
},
},
},
expectErr: false,
name: "Monitor failure, no and unknown alerts",
name: "Monitor failure, no alerts",
},
{
config: Config{
@@ -52,6 +46,28 @@ func TestCheckMonitors(t *testing.T) {
Command: CommandOrShell{Command: []string{"ls"}},
alertCount: 1,
},
},
},
expectErr: false,
name: "Monitor recovery, no alerts",
},
{
config: Config{
Monitors: []*Monitor{
&Monitor{
Name: "Failure",
Command: CommandOrShell{Command: []string{"false"}},
AlertDown: []string{"unknown"},
AlertAfter: 1,
},
},
},
expectErr: true,
name: "Monitor failure, unknown alerts",
},
{
config: Config{
Monitors: []*Monitor{
&Monitor{
Name: "Success",
Command: CommandOrShell{Command: []string{"true"}},
@@ -60,8 +76,8 @@ func TestCheckMonitors(t *testing.T) {
},
},
},
expectErr: false,
name: "Monitor recovery, no alerts",
expectErr: true,
name: "Monitor recovery, unknown alerts",
},
{
config: Config{
@@ -105,10 +121,16 @@ func TestCheckMonitors(t *testing.T) {
}
for _, c := range cases {
c.config.Init()
err := checkMonitors(&c.config)
err := c.config.Init()
if err != nil {
t.Errorf("checkMonitors(%s): unexpected error reading config: %v", c.name, err)
}
err = checkMonitors(&c.config)
if err == nil && c.expectErr {
t.Errorf("checkMonitors(%s): Expected panic, the code did not panic", c.name)
} else if err != nil && !c.expectErr {
t.Errorf("checkMonitors(%s): Did not expect an error, but we got one anyway: %v", c.name, err)
}
}
}
+17 -1
View File
@@ -19,6 +19,7 @@ import (
type MinitorMetrics struct {
alertCount *prometheus.CounterVec
checkCount *prometheus.CounterVec
checkTime *prometheus.GaugeVec
monitorStatus *prometheus.GaugeVec
}
@@ -40,6 +41,13 @@ func NewMetrics() *MinitorMetrics {
},
[]string{"monitor", "status", "is_alert"},
),
checkTime: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "minitor_check_milliseconds",
Help: "Time in miliseconds that a check ran for",
},
[]string{"monitor", "status"},
),
monitorStatus: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "minitor_monitor_up_count",
@@ -52,6 +60,7 @@ func NewMetrics() *MinitorMetrics {
// Register newly created metrics
prometheus.MustRegister(metrics.alertCount)
prometheus.MustRegister(metrics.checkCount)
prometheus.MustRegister(metrics.checkTime)
prometheus.MustRegister(metrics.monitorStatus)
return metrics
@@ -63,11 +72,12 @@ func (metrics *MinitorMetrics) SetMonitorStatus(monitor string, isUp bool) {
if isUp {
val = 1.0
}
metrics.monitorStatus.With(prometheus.Labels{"monitor": monitor}).Set(val)
}
// CountCheck counts the result of a particular Monitor check
func (metrics *MinitorMetrics) CountCheck(monitor string, isSuccess bool, isAlert bool) {
func (metrics *MinitorMetrics) CountCheck(monitor string, isSuccess bool, ms int64, isAlert bool) {
status := "failure"
if isSuccess {
status = "success"
@@ -81,6 +91,10 @@ func (metrics *MinitorMetrics) CountCheck(monitor string, isSuccess bool, isAler
metrics.checkCount.With(
prometheus.Labels{"monitor": monitor, "status": status, "is_alert": alertVal},
).Inc()
metrics.checkTime.With(
prometheus.Labels{"monitor": monitor, "status": status},
).Set(float64(ms))
}
// CountAlert counts an alert
@@ -96,6 +110,8 @@ func (metrics *MinitorMetrics) CountAlert(monitor string, alert string) {
// ServeMetrics registers the Prometheus handler under /metrics and then
// serves HTTP on MetricsPort. The error returned by ListenAndServe is
// deliberately discarded.
func ServeMetrics() {
	http.Handle("/metrics", promhttp.Handler())

	_ = http.ListenAndServe(fmt.Sprintf(":%d", MetricsPort), nil)
}
+46 -37
View File
@@ -1,28 +1,31 @@
package main
import (
"log"
"math"
"os/exec"
"time"
"git.iamthefij.com/iamthefij/slog"
)
// Monitor represents a particular periodic check of a command
type Monitor struct {
type Monitor struct { //nolint:maligned
// Config values
AlertAfter int16 `yaml:"alert_after"`
AlertEvery int16 `yaml:"alert_every"`
CheckInterval SecondsOrDuration `yaml:"check_interval"`
Name string
Command CommandOrShell
AlertDown []string `yaml:"alert_down"`
AlertUp []string `yaml:"alert_up"`
CheckInterval float64 `yaml:"check_interval"`
AlertAfter int16 `yaml:"alert_after"`
AlertEvery int16 `yaml:"alert_every"`
Command CommandOrShell
// Other values
lastCheck time.Time
lastOutput string
alertCount int16
failureCount int16
lastSuccess time.Time
alertCount int16
failureCount int16
lastCheck time.Time
lastSuccess time.Time
lastOutput string
lastCheckDuration time.Duration
}
// IsValid returns a boolean indicating if the Monitor has been correctly
@@ -40,8 +43,9 @@ func (monitor Monitor) ShouldCheck() bool {
return true
}
sinceLastCheck := time.Now().Sub(monitor.lastCheck).Seconds()
return sinceLastCheck >= monitor.CheckInterval
sinceLastCheck := time.Since(monitor.lastCheck)
return sinceLastCheck >= monitor.CheckInterval.Value()
}
// Check will run the command configured by the Monitor and return a status
@@ -54,11 +58,14 @@ func (monitor *Monitor) Check() (bool, *AlertNotice) {
cmd = ShellCommand(monitor.Command.ShellCommand)
}
checkStartTime := time.Now()
output, err := cmd.CombinedOutput()
monitor.lastCheck = time.Now()
monitor.lastOutput = string(output)
monitor.lastCheckDuration = monitor.lastCheck.Sub(checkStartTime)
var alertNotice *AlertNotice
isSuccess := (err == nil)
if isSuccess {
alertNotice = monitor.success()
@@ -66,17 +73,11 @@ func (monitor *Monitor) Check() (bool, *AlertNotice) {
alertNotice = monitor.failure()
}
if LogDebug {
log.Printf("DEBUG: Command output: %s", monitor.lastOutput)
}
if err != nil {
if LogDebug {
log.Printf("DEBUG: Command result: %v", err)
}
}
slog.Debugf("Command output: %s", monitor.lastOutput)
slog.OnErrWarnf(err, "Command result: %v", err)
log.Printf(
"INFO: %s success=%t, alert=%t",
slog.Infof(
"%s success=%t, alert=%t",
monitor.Name,
isSuccess,
alertNotice != nil,
@@ -90,11 +91,17 @@ func (monitor Monitor) IsUp() bool {
return monitor.alertCount == 0
}
// LastCheckMilliseconds reports how long the last check ran for, in whole milliseconds
func (monitor Monitor) LastCheckMilliseconds() int64 {
	return int64(monitor.lastCheckDuration / time.Millisecond)
}
func (monitor *Monitor) success() (notice *AlertNotice) {
if !monitor.IsUp() {
// Alert that we have recovered
notice = monitor.createAlertNotice(true)
}
monitor.failureCount = 0
monitor.alertCount = 0
monitor.lastSuccess = time.Now()
@@ -106,15 +113,14 @@ func (monitor *Monitor) failure() (notice *AlertNotice) {
monitor.failureCount++
// If we haven't hit the minimum failures, we can exit
if monitor.failureCount < monitor.getAlertAfter() {
if LogDebug {
log.Printf(
"DEBUG: %s failed but did not hit minimum failures. "+
"Count: %v alert after: %v",
monitor.Name,
monitor.failureCount,
monitor.getAlertAfter(),
)
}
slog.Debugf(
"%s failed but did not hit minimum failures. "+
"Count: %v alert after: %v",
monitor.Name,
monitor.failureCount,
monitor.getAlertAfter(),
)
return
}
@@ -122,19 +128,20 @@ func (monitor *Monitor) failure() (notice *AlertNotice) {
failureCount := (monitor.failureCount - monitor.getAlertAfter())
// Use alert cadence to determine if we should alert
if monitor.AlertEvery > 0 {
switch {
case monitor.AlertEvery > 0:
// Handle integer number of failures before alerting
if failureCount%monitor.AlertEvery == 0 {
notice = monitor.createAlertNotice(false)
}
} else if monitor.AlertEvery == 0 {
case monitor.AlertEvery == 0:
// Handle alerting on first failure only
if failureCount == 0 {
notice = monitor.createAlertNotice(false)
}
} else {
default:
// Handle negative numbers indicating an exponential backoff
if failureCount >= int16(math.Pow(2, float64(monitor.alertCount))-1) {
if failureCount >= int16(math.Pow(2, float64(monitor.alertCount))-1) { //nolint:gomnd
notice = monitor.createAlertNotice(false)
}
}
@@ -144,7 +151,7 @@ func (monitor *Monitor) failure() (notice *AlertNotice) {
monitor.alertCount++
}
return
return notice
}
func (monitor Monitor) getAlertAfter() int16 {
@@ -153,6 +160,7 @@ func (monitor Monitor) getAlertAfter() int16 {
if monitor.AlertAfter == 0 {
return 1
}
return monitor.AlertAfter
}
@@ -161,6 +169,7 @@ func (monitor Monitor) GetAlertNames(up bool) []string {
if up {
return monitor.AlertUp
}
return monitor.AlertDown
}
+22 -3
View File
@@ -22,11 +22,13 @@ func TestMonitorIsValid(t *testing.T) {
for _, c := range cases {
log.Printf("Testing case %s", c.name)
actual := c.monitor.IsValid()
if actual != c.expected {
t.Errorf("IsValid(%v), expected=%t actual=%t", c.name, c.expected, actual)
log.Printf("Case failed: %s", c.name)
}
log.Println("-----")
}
}
@@ -43,9 +45,9 @@ func TestMonitorShouldCheck(t *testing.T) {
name string
}{
{Monitor{}, true, "Empty"},
{Monitor{lastCheck: timeNow, CheckInterval: 15}, false, "Just checked"},
{Monitor{lastCheck: timeTenSecAgo, CheckInterval: 15}, false, "-10s"},
{Monitor{lastCheck: timeTwentySecAgo, CheckInterval: 15}, true, "-20s"},
{Monitor{lastCheck: timeNow, CheckInterval: SecondsOrDuration{time.Second * 15}}, false, "Just checked"},
{Monitor{lastCheck: timeTenSecAgo, CheckInterval: SecondsOrDuration{time.Second * 15}}, false, "-10s"},
{Monitor{lastCheck: timeTwentySecAgo, CheckInterval: SecondsOrDuration{time.Second * 15}}, true, "-20s"},
}
for _, c := range cases {
@@ -71,11 +73,13 @@ func TestMonitorIsUp(t *testing.T) {
for _, c := range cases {
log.Printf("Testing case %s", c.name)
actual := c.monitor.IsUp()
if actual != c.expected {
t.Errorf("IsUp(%v), expected=%t actual=%t", c.name, c.expected, actual)
log.Printf("Case failed: %s", c.name)
}
log.Println("-----")
}
}
@@ -96,11 +100,13 @@ func TestMonitorGetAlertNames(t *testing.T) {
for _, c := range cases {
log.Printf("Testing case %s", c.name)
actual := c.monitor.GetAlertNames(c.up)
if !EqualSliceString(actual, c.expected) {
t.Errorf("GetAlertNames(%v), expected=%v actual=%v", c.name, c.expected, actual)
log.Printf("Case failed: %s", c.name)
}
log.Println("-----")
}
}
@@ -119,12 +125,15 @@ func TestMonitorSuccess(t *testing.T) {
for _, c := range cases {
log.Printf("Testing case %s", c.name)
notice := c.monitor.success()
hasNotice := (notice != nil)
if hasNotice != c.expectNotice {
t.Errorf("success(%v), expected=%t actual=%t", c.name, c.expectNotice, hasNotice)
log.Printf("Case failed: %s", c.name)
}
log.Println("-----")
}
}
@@ -147,12 +156,15 @@ func TestMonitorFailureAlertAfter(t *testing.T) {
for _, c := range cases {
log.Printf("Testing case %s", c.name)
notice := c.monitor.failure()
hasNotice := (notice != nil)
if hasNotice != c.expectNotice {
t.Errorf("failure(%v), expected=%t actual=%t", c.name, c.expectNotice, hasNotice)
log.Printf("Case failed: %s", c.name)
}
log.Println("-----")
}
}
@@ -195,10 +207,12 @@ func TestMonitorFailureAlertEvery(t *testing.T) {
notice := c.monitor.failure()
hasNotice := (notice != nil)
if hasNotice != c.expectNotice {
t.Errorf("failure(%v), expected=%t actual=%t", c.name, c.expectNotice, hasNotice)
log.Printf("Case failed: %s", c.name)
}
log.Println("-----")
}
}
@@ -223,15 +237,18 @@ func TestMonitorFailureExponential(t *testing.T) {
// Unlike previous tests, this one requires a static Monitor with repeated
// calls to the failure method
monitor := Monitor{failureCount: 0, AlertAfter: 1, AlertEvery: -1}
for _, c := range cases {
log.Printf("Testing case %s", c.name)
notice := monitor.failure()
hasNotice := (notice != nil)
if hasNotice != c.expectNotice {
t.Errorf("failure(%v), expected=%t actual=%t", c.name, c.expectNotice, hasNotice)
log.Printf("Case failed: %s", c.name)
}
log.Println("-----")
}
}
@@ -243,6 +260,7 @@ func TestMonitorCheck(t *testing.T) {
hasNotice bool
lastOutput string
}
cases := []struct {
monitor Monitor
expect expected
@@ -290,6 +308,7 @@ func TestMonitorCheck(t *testing.T) {
t.Errorf("Check(%v) (output), expected=%v actual=%v", c.name, c.expect.lastOutput, lastOutput)
log.Printf("Case failed: %s", c.name)
}
log.Println("-----")
}
}
+6 -5
View File
@@ -3,14 +3,14 @@ check_interval: 5
monitors:
- name: Fake Website
command: ['curl', '-s', '-o', '/dev/null', 'https://minitor.mon']
command: ["curl", "-s", "-o", "/dev/null", "https://minitor.mon"]
alert_down: [log_down, mailgun_down, sms_down]
alert_up: [log_up, email_up]
check_interval: 10 # Must be at minimum the global `check_interval`
check_interval: 10 # Must be at minimum the global `check_interval`
alert_after: 3
alert_every: -1 # Defaults to -1 for exponential backoff. 0 to disable repeating
alert_every: -1 # Defaults to -1 for exponential backoff. 0 to disable repeating
- name: Real Website
command: ['curl', '-s', '-o', '/dev/null', 'https://google.com']
command: ["curl", "-s", "-o", "/dev/null", "https://google.com"]
alert_down: [log_down, mailgun_down, sms_down]
alert_up: [log_up, email_up]
check_interval: 5
@@ -23,7 +23,8 @@ alerts:
log_up:
command: ["echo", "Minitor recovery for {{.MonitorName}}"]
email_up:
command: [sendmail, "me@minitor.mon", "Recovered: {monitor_name}", "We're back!"]
command:
[sendmail, "me@minitor.mon", "Recovered: {monitor_name}", "We're back!"]
mailgun_down:
command: >
curl -s -X POST
+15 -3
View File
@@ -11,6 +11,7 @@ set -e
# To override, export DOCKER_HOST to a new hostname
DOCKER_HOST="${DOCKER_HOST:=socket}"
container_name="$1"
num_log_lines="$2"
# Curls Docker either using a socket or URL
function curl_docker {
@@ -31,21 +32,32 @@ function get_container_id {
# Returns container JSON
function inspect_container {
local container_id=$1
local container_id="$1"
curl_docker "containers/$container_id/json"
}
# Prints the trailing lines of a container's log.
#   $1 - container id used in the Docker logs endpoint path
#   $2 - number of trailing lines to keep (handed to `tail -n`)
# Both stdout and stderr are requested through curl_docker.
function get_logs {
    # Declared local so calling this helper does not overwrite the
    # script-level container_id or leak num_lines into the global scope.
    local container_id="$1"
    local num_lines="$2"
    curl_docker "containers/$container_id/logs?stdout=1&stderr=1" | tail -n "$num_lines"
}
if [ -z "$container_name" ]; then
echo "Usage: $0 container_name"
echo "Usage: $0 container_name [num_log_lines]"
echo "Will exit with the last status code of continer with provided name"
exit 1
fi
container_id=$(get_container_id $container_name)
container_id=$(get_container_id "$container_name")
if [ -z "$container_id" ]; then
echo "ERROR: Could not find container with name: $container_name"
exit 1
fi
exit_code=$(inspect_container "$container_id" | jq -r .State.ExitCode)
if [ -n "$num_log_lines" ]; then
get_logs "$container_id" "$num_log_lines"
fi
exit "$exit_code"
+13 -1
View File
@@ -11,6 +11,7 @@ set -e
# To override, export DOCKER_HOST to a new hostname
DOCKER_HOST="${DOCKER_HOST:=socket}"
container_name="$1"
num_log_lines="$2"
# Curls Docker either using a socket or URL
function curl_docker {
@@ -35,8 +36,15 @@ function inspect_container {
curl_docker "containers/$container_id/json"
}
# Prints the trailing lines of a container's log.
#   $1 - container id used in the Docker logs endpoint path
#   $2 - number of trailing lines to keep (handed to `tail -n`)
# Both stdout and stderr are requested through curl_docker.
function get_logs {
    # Declared local so calling this helper does not overwrite the
    # script-level container_id or leak num_lines into the global scope.
    local container_id="$1"
    local num_lines="$2"
    curl_docker "containers/$container_id/logs?stdout=1&stderr=1" | tail -n "$num_lines"
}
if [ -z "$container_name" ]; then
echo "Usage: $0 container_name"
echo "Usage: $0 container_name [num_log_lines]"
echo "Will return results of healthcheck for continer with provided name"
exit 1
fi
@@ -48,6 +56,10 @@ if [ -z "$container_id" ]; then
fi
health=$(inspect_container "$container_id" | jq -r '.State.Health.Status')
if [ -n "$num_log_lines" ]; then
get_logs "$container_id" "$num_log_lines"
fi
case "$health" in
null)
echo "No healthcheck results"
+6 -4
View File
@@ -3,21 +3,23 @@ check_interval: 1
monitors:
- name: Command
command: ['echo', '$PATH']
alert_down: ['log_command', 'log_shell']
command: ["echo", "$PATH"]
alert_down: ["log_command", "log_shell"]
alert_every: 0
check_interval: 10s
- name: Shell
command: >
echo 'Some string with stuff';
echo 'another line';
echo $PATH;
exit 1
alert_down: ['log_command', 'log_shell']
alert_down: ["log_command", "log_shell"]
alert_after: 5
alert_every: 0
check_interval: 1m
alerts:
log_command:
command: ['echo', 'regular', '"command!!!"', "{{.MonitorName}}"]
command: ["echo", "regular", '"command!!!"', "{{.MonitorName}}"]
log_shell:
command: echo "Failure on {{.MonitorName}} User is $USER"
+3 -1
View File
@@ -8,7 +8,7 @@ import (
// ShellCommand builds an *exec.Cmd that runs the given string through `sh -c`.
// Leading and trailing whitespace is trimmed from command before it is handed
// to the shell.
func ShellCommand(command string) *exec.Cmd {
	script := strings.TrimSpace(command)

	return exec.Command("sh", "-c", script)
}
@@ -17,10 +17,12 @@ func EqualSliceString(a, b []string) bool {
if len(a) != len(b) {
return false
}
for i, val := range a {
if val != b[i] {
return false
}
}
return true
}