WIP: Begin adding prometheus metrics exporting

2019-11-15 11:25:21 -08:00
15 changed files with 51 additions and 405 deletions
@@ -1,59 +1,13 @@
 ---
 # Drone pipeline "test": builds the project, runs the tests and the
 # pre-commit checks, then sends an email notification.
 kind: pipeline
 name: test
 steps:
  # Compile via the Makefile in a Go 1.12 image
  - name: build
    image: golang:1.12
    commands:
      - make build
  # Build again, then run the test target
  - name: test
    image: golang:1.12
    commands:
      - make build
      - make test
  # Install a pinned pre-commit and run `make check`
  - name: check
    image: python:3
    commands:
      - pip install pre-commit==1.20.0
      - make check
  # Email notification; SMTP host and credentials are read from secrets
  - name: notify
    image: drillster/drone-email
    settings:
      host:
        from_secret: SMTP_HOST
      username:
        from_secret: SMTP_USER
      password:
        from_secret: SMTP_PASS
      from: drone@iamthefij.com
    # Limit this step to the listed build statuses
    when:
      status: [changed, failure]
 ---
 # Drone pipeline "publish": runs after the "test" pipeline and pushes the
 # Docker image built from Dockerfile.multi-stage.
 kind: pipeline
 name: publish
 depends_on:
  - test
 # Only push and tag events on the master branch or v* tags trigger this pipeline
 trigger:
  event:
    - push
    - tag
  refs:
    - refs/heads/master
    - refs/tags/v*
 steps:
  # Might consider moving this step into the previous pipeline
  - name: push image
    image: plugins/docker
    settings:
      repo: iamthefij/minitor-go
      dockerfile: Dockerfile.multi-stage
      auto_tag: true
      # Registry credentials are read from secrets
      username:
        from_secret: docker_username
      password:
        from_secret: docker_password
@@ -1,19 +0,0 @@
 ---
 # pre-commit hook repositories, each pinned to a fixed revision
 repos:
  # General-purpose file checks
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v2.4.0
    hooks:
      - id: check-added-large-files
      - id: check-yaml
        # presumably needed for multi-document YAML such as the Drone pipeline config
        args:
          - --allow-multiple-documents
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-merge-conflict
  # Go formatting hooks. Fetched over https because the unauthenticated
  # git:// protocol is unencrypted and is no longer served by GitHub.
  - repo: https://github.com/dnephin/pre-commit-golang
    rev: v0.3.5
    hooks:
      - id: go-fmt
      - id: go-imports
      # - id: gometalinter
      # - id: golangci-lint
@@ -1,24 +1,8 @@
 ARG REPO=library
-FROM ${REPO}/alpine:3.10
+FROM ${REPO}/busybox:latest
-RUN mkdir /app
+WORKDIR /root/
 WORKDIR /app/
 # Copy minitor in
 ARG ARCH=amd64
 COPY ./minitor-go ./minitor
 # Add common checking tools
 RUN apk --no-cache add bash=~5.0 curl=~7.66 jq=~1.6
 # Add minitor user for running as non-root
 RUN addgroup -S minitor && adduser -S minitor -G minitor
 # Copy scripts
 COPY ./scripts /app/scripts
 RUN chown -R minitor:minitor /app
 RUN chmod -R 755 /app/scripts
 # Drop to non-root user
 USER minitor
 ENTRYPOINT [ "./minitor" ]
@@ -1,7 +1,7 @@
 ARG REPO=library
 FROM golang:1.12-alpine AS builder
-RUN apk add --no-cache git=~2
+RUN apk add --no-cache git
 RUN mkdir /app
 WORKDIR /app
@@ -16,27 +16,8 @@ ARG VERSION=dev
 ENV CGO_ENABLED=0 GOOS=linux GOARCH=${ARCH}
 RUN go build -ldflags "-X main.version=${VERSION}" -a -installsuffix nocgo -o minitor .
-FROM ${REPO}/alpine:3.10
+FROM ${REPO}/busybox:latest
-RUN mkdir /app
+WORKDIR /root/
 WORKDIR /app/
 # Copy minitor in
 COPY --from=builder /app/minitor .
 # Add common checking tools
 RUN apk --no-cache add bash=~5.0 curl=~7.66 jq=~1.6
 # Add minitor user for running as non-root
 RUN addgroup -S minitor && adduser -S minitor -G minitor
 # Copy scripts
 COPY ./scripts /app/scripts
 RUN chown -R minitor:minitor /app
 RUN chmod -R 755 /app/scripts
 # Drop to non-root user
 USER minitor
 ENTRYPOINT [ "./minitor" ]
 # vim: set filetype=dockerfile:
@@ -1,7 +1,6 @@
 .PHONY: all
 DOCKER_TAG ?= minitor-go-${USER}
-.PHONY: default
+.PHONY: test
 default: test
 .PHONY: build
@@ -15,10 +14,6 @@ minitor-go:
 run: minitor-go build
 	./minitor-go -debug
 .PHONY: run-metrics
 run-metrics: minitor-go build
 	./minitor-go -debug -metrics
 .PHONY: test
 test:
 	go test -coverprofile=coverage.out
@@ -29,15 +24,6 @@ test:
 	@go tool cover -func=coverage.out | awk -v target=80.0% \
 		'/^total:/ { print "Total coverage: " $$3 " Minimum coverage: " target; if ($$3+0.0 >= target+0.0) print "ok"; else { print "fail"; exit 1; } }'
 # Installs pre-commit hooks
 .PHONY: install-hooks
 install-hooks:
 	pre-commit install --install-hooks
 # Checks files for encryption
 .PHONY: check
 check:
 	pre-commit run --all-files
 .PHONY: clean
 clean:
@@ -2,7 +2,7 @@
 A reimplementation of [Minitor](https://git.iamthefij/iamthefij/minitor) in Go
-Minitor is already a minimal monitoring tool. Python 3 was a quick way to get something live, but Python itself comes with a large footprint. Thus Go feels like a better fit for the project, longer term.
+Minitor is already a very minimal monitoring tool. Python 3 was a quick way to get something live, but Python itself comes with a very large footprint. Thus Go feels like a better fit for the project, longer term.
 Initial target is meant to be roughly compatible requiring only minor changes to configuration. Future iterations may diverge to take advantage of Go specific features.
@@ -30,7 +30,7 @@ monitors:
    command_shell: echo 'test'
 ```
-Second, templating for Alert messages has been updated. In the Python version, `str.format(...)` was used with certain keys passed in that could be used to format messages. In the Go version, we use a struct, `AlertNotice` defined in `alert.go` and the built in Go templating format. Eg.
+Second, templating for Alert messages has been updated. In the Python version, `str.format(...)` was used with certain keys passed in that could be used to format messages. In the Go version, we use a struct containing Alert info and the built in Go templating format. Eg.
 minitor-py:
 ```yaml
@@ -38,7 +38,7 @@ alerts:
  log_command:
    command: ['echo', '{monitor_name}']
  log_shell:
-    command_shell: 'echo {monitor_name}'
+    command_shell: "echo {monitor_name}"
 ```
 minitor-go:
@@ -47,7 +47,7 @@ alerts:
  log_command:
    command: ['echo', '{{.MonitorName}}']
  log_shell:
-    command_shell: 'echo {{.MonitorName}}'
+    command_shell: "echo {{.MonitorName}}"
 ```
 Finally, newlines in a shell command don't terminate a particular command. Semicolons must be used to separate commands, and line continuations should not be used.
@@ -84,11 +84,10 @@ Pairity:
  - [x] Run alert commands
  - [x] Run alert commands in a shell
  - [x] Allow templating of alert commands
-  - [x] Implement Prometheus client to export metrics
+  - [ ] Implement Prometheus client to export metrics
-  - [x] Test coverage
+  - [ ] Test coverage
  - [ ] Integration testing (manual or otherwise)
-Improvement (potentially breaking):
+Improvement:
  - [ ] Implement leveled logging (maybe glog or logrus)
  - [ ] Consider switching from YAML to TOML
@@ -96,4 +95,3 @@ Improvement (potentially breaking):
  - [ ] Consider dropping `alert_up` and `alert_down` in favor of using Go templates that offer more control of messaging
  - [ ] Async checking
  - [ ] Use durations rather than seconds checked in event loop
  - [ ] Revisit metrics and see if they all make sense
@@ -2,11 +2,10 @@ package main
 import (
 	"errors"
 	"gopkg.in/yaml.v2"
 	"io/ioutil"
 	"log"
 	"os"
 	"gopkg.in/yaml.v2"
 )
 // Config type contains all provided user configuration
@@ -85,9 +84,7 @@ func LoadConfig(filePath string) (config Config, err error) {
 		return
 	}
-	if LogDebug {
+	log.Printf("config:\n%v\n", config)
 		log.Printf("DEBUG: Config values:\n%v\n", config)
 	}
 	if !config.IsValid() {
 		err = errors.New("Invalid configuration")
@@ -3,7 +3,9 @@ package main
 import (
 	"flag"
 	"fmt"
 	"github.com/prometheus/client_golang/prometheus/promhttp"
 	"log"
 	"net/http"
 	"time"
 )
@@ -13,10 +15,6 @@ var (
 	// ExportMetrics will track whether or not we want to export metrics to prometheus
 	ExportMetrics = false
 	// MetricsPort is the port to expose metrics on
 	MetricsPort = 8080
 	// Metrics contains all active metrics
 	Metrics = NewMetrics()
 	// version of minitor being run
 	version = "dev"
@@ -25,13 +23,7 @@ var (
 func checkMonitors(config *Config) error {
 	for _, monitor := range config.Monitors {
 		if monitor.ShouldCheck() {
-			success, alertNotice := monitor.Check()
+			_, alertNotice := monitor.Check()
 			hasAlert := alertNotice != nil
 			// Track status metrics
 			Metrics.SetMonitorStatus(monitor.Name, success)
 			Metrics.CountCheck(monitor.Name, success, hasAlert)
 			// Should probably consider refactoring everything below here
 			if alertNotice != nil {
@@ -63,9 +55,6 @@ func checkMonitors(config *Config) error {
 								err,
 							)
 						}
 						// Count alert metrics
 						Metrics.CountAlert(monitor.Name, alert.Name)
 					} else {
 						// This case should never actually happen since we validate against it
 						log.Printf("ERROR: Unknown alert for monitor %s: %s", alertNotice.MonitorName, alertName)
@@ -79,6 +68,11 @@ func checkMonitors(config *Config) error {
 	return nil
 }
 // serveMetrics registers the Prometheus handler at /metrics on the default
 // mux and serves it on port 8080. It blocks while the server is running, so
 // it is started in its own goroutine.
 func serveMetrics() {
 	http.Handle("/metrics", promhttp.Handler())
 	// ListenAndServe always returns a non-nil error. Log it instead of
 	// discarding it so a failed bind (e.g. port already in use) is visible.
 	if err := http.ListenAndServe(":8080", nil); err != nil {
 		log.Printf("ERROR: Metrics server stopped: %v", err)
 	}
 }
 func main() {
 	// Get debug flag
 	flag.BoolVar(&LogDebug, "debug", false, "Enables debug logs (default: false)")
@@ -101,7 +95,7 @@ func main() {
 	// Serve metrics exporter, if specified
 	if ExportMetrics {
 		log.Println("INFO: Exporting metrics to Prometheus")
-		go ServeMetrics()
+		go serveMetrics()
 	}
 	// Start main loop
@@ -1,101 +0,0 @@
 package main
 import (
 	"fmt"
 	"log"
 	"net/http"

 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promhttp"
 )
 // TODO: Not sure if this is the best way to handle. A global instance for
 // metrics isn't bad, but it might be nice to curry versions of the metrics
 // for each monitor. Especially since every monitor has it's own. Perhaps
 // another new function that essentially curries each metric for a given
 // monitor name would do. This could be run when validating monitors and
 // initializing alert templates.
 // MinitorMetrics contains all counters and metrics that Minitor will need to access
 type MinitorMetrics struct {
 	// alertCount counts alerts, labeled by alert and monitor name
 	alertCount    *prometheus.CounterVec
 	// checkCount counts checks, labeled by monitor name, status and is_alert
 	checkCount    *prometheus.CounterVec
 	// monitorStatus holds the current status per monitor (1 when up, 0 otherwise)
 	monitorStatus *prometheus.GaugeVec
 }
 // NewMetrics builds the alert counter, the check counter and the monitor
 // status gauge, registers them with Prometheus, and returns them bundled in
 // a MinitorMetrics
 func NewMetrics() *MinitorMetrics {
 	alerts := prometheus.NewCounterVec(
 		prometheus.CounterOpts{
 			Name: "minitor_alert_total",
 			Help: "Number of Minitor alerts",
 		},
 		[]string{"alert", "monitor"},
 	)
 	checks := prometheus.NewCounterVec(
 		prometheus.CounterOpts{
 			Name: "minitor_check_total",
 			Help: "Number of Minitor checks",
 		},
 		[]string{"monitor", "status", "is_alert"},
 	)
 	status := prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "minitor_monitor_up_count",
 			Help: "Status of currently responsive monitors",
 		},
 		[]string{"monitor"},
 	)
 	// Collectors are registered in the same order as before; MustRegister
 	// panics on the first one that fails to register
 	prometheus.MustRegister(alerts, checks, status)
 	return &MinitorMetrics{
 		alertCount:    alerts,
 		checkCount:    checks,
 		monitorStatus: status,
 	}
 }
 // SetMonitorStatus records the current state of a monitor by setting its
 // status gauge to 1 when it is up and 0 otherwise
 func (metrics *MinitorMetrics) SetMonitorStatus(monitor string, isUp bool) {
 	gauge := metrics.monitorStatus.With(prometheus.Labels{"monitor": monitor})
 	if isUp {
 		gauge.Set(1.0)
 		return
 	}
 	gauge.Set(0.0)
 }
 // CountCheck increments the check counter for a monitor, labeling the
 // sample with the check outcome and whether it produced an alert
 func (metrics *MinitorMetrics) CountCheck(monitor string, isSuccess bool, isAlert bool) {
 	// Start from the failure / no-alert labels and override as needed
 	labels := prometheus.Labels{
 		"monitor":  monitor,
 		"status":   "failure",
 		"is_alert": "false",
 	}
 	if isSuccess {
 		labels["status"] = "success"
 	}
 	if isAlert {
 		labels["is_alert"] = "true"
 	}
 	metrics.checkCount.With(labels).Inc()
 }
 // CountAlert increments the alert counter for the given monitor and alert
 func (metrics *MinitorMetrics) CountAlert(monitor string, alert string) {
 	labels := prometheus.Labels{"monitor": monitor, "alert": alert}
 	metrics.alertCount.With(labels).Inc()
 }
 // ServeMetrics starts an http server with a Prometheus metrics handler at
 // /metrics, listening on MetricsPort. It blocks while the server is running.
 func ServeMetrics() {
 	http.Handle("/metrics", promhttp.Handler())
 	host := fmt.Sprintf(":%d", MetricsPort)
 	// ListenAndServe always returns a non-nil error. Log it instead of
 	// discarding it so a failed bind (e.g. port already in use) is visible.
 	if err := http.ListenAndServe(host, nil); err != nil {
 		log.Printf("ERROR: Metrics server stopped: %v", err)
 	}
 }
@@ -1,41 +1,29 @@
---
+check_interval: 30
 check_interval: 5
 monitors:
-  - name: Fake Website
+  - name: My Website
    command: [ 'curl', '-s', '-o', '/dev/null', 'https://minitor.mon' ]
-    alert_down: [log_down, mailgun_down, sms_down]
+    alert_down: [ log, mailgun_down, sms_down ]
-    alert_up: [log_up, email_up]
+    alert_up: [ log, email_up ]
-    check_interval: 10  # Must be at minimum the global `check_interval`
+    check_interval: 30 # Must be at minimum the global `check_interval`
    alert_after: 3
    alert_every: -1 # Defaults to -1 for exponential backoff. 0 to disable repeating
  - name: Real Website
    command: ['curl', '-s', '-o', '/dev/null', 'https://google.com']
    alert_down: [log_down, mailgun_down, sms_down]
    alert_up: [log_up, email_up]
    check_interval: 5
    alert_after: 3
    alert_every: -1
 alerts:
  log_down:
    command: ["echo", "Minitor failure for {{.MonitorName}}"]
  log_up:
    command: ["echo", "Minitor recovery for {{.MonitorName}}"]
  # Alert messages use Go template syntax, e.g. {{.MonitorName}}
  email_up:
    command: [sendmail, "me@minitor.mon", "Recovered: {{.MonitorName}}", "We're back!"]
  mailgun_down:
-    command_shell: >
+    command: >
      curl -s -X POST
-      -F subject="Alert! {{.MonitorName}} failed"
+      -F subject="Alert! {monitor_name} failed"
      -F from="Minitor <minitor@minitor.mon>"
      -F to=me@minitor.mon
      -F text="Our monitor failed"
      https://api.mailgun.net/v3/minitor.mon/messages
      -u "api:${MAILGUN_API_KEY}"
  sms_down:
-    command_shell: >
+    command: >
-      curl -s -X POST -F "Body=Failure! {{.MonitorName}} has failed"
+      curl -s -X POST -F "Body=Failure! {monitor_name} has failed"
      -F "From=${AVAILABLE_NUMBER}" -F "To=${MY_PHONE}"
      "https://api.twilio.com/2010-04-01/Accounts/${ACCOUNT_SID}/Messages"
      -u "${ACCOUNT_SID}:${AUTH_TOKEN}"
@@ -1,5 +0,0 @@
 # Minitor Scripts
 A collection of some handy scripts to use with Minitor
 These are not included with the Python package, but they are included in the Docker image in `/app/scripts`.
@@ -1,51 +0,0 @@
 #! /bin/bash
 set -e
 #################
 # docker_check.sh
 #
 # Checks the most recent state exit code of a Docker container
 #################
 # Docker host will default to a socket
 # To override, export DOCKER_HOST to a new hostname
 DOCKER_HOST="${DOCKER_HOST:=socket}"
 container_name="$1"
 # Requests the given API path from Docker: over the local unix socket when
 # DOCKER_HOST is "socket", otherwise over HTTP to DOCKER_HOST
 function curl_docker {
    local api_path="$1"
    case "$DOCKER_HOST" in
        socket)
            curl --unix-socket /var/run/docker.sock "http://localhost/$api_path" 2>/dev/null
            ;;
        *)
            curl "http://${DOCKER_HOST}/$api_path" 2>/dev/null
            ;;
    esac
 }
 # Prints the ID of the container whose name matches the given name
 function get_container_id {
    local wanted_name="$1"
    # Docker reports names with a leading slash, so match against "/<name>"
    local name_filter=".[] | {Id, Name: .Names[]} | select(.Name == \"/${wanted_name}\") | .Id"
    curl_docker 'containers/json?all=1' | jq -r "$name_filter"
 }
 # Prints the inspect JSON for the container with the given ID
 function inspect_container {
    local id="$1"
    curl_docker "containers/${id}/json"
 }
 # A container name is required as the first argument
 if [ -z "$container_name" ]; then
    echo "Usage: $0 container_name"
    echo "Will exit with the last status code of continer with provided name"
    exit 1
 fi
 # Quote the name so it is passed as a single argument, without word
 # splitting or glob expansion
 container_id=$(get_container_id "$container_name")
 if [ -z "$container_id" ]; then
    echo "ERROR: Could not find container with name: $container_name"
    exit 1
 fi
 # Exit with the exit code Docker reports in the container's state
 exit_code=$(inspect_container "$container_id" | jq -r .State.ExitCode)
 exit "$exit_code"
@@ -1,61 +0,0 @@
 #! /bin/bash
 set -e
 #################
 # docker_healthcheck.sh
 #
 # Returns the results of a Docker Healthcheck for a container
 #################
 # Docker host will default to a socket
 # To override, export DOCKER_HOST to a new hostname
 DOCKER_HOST="${DOCKER_HOST:=socket}"
 container_name="$1"
 # Requests the given API path from Docker: over the local unix socket when
 # DOCKER_HOST is "socket", otherwise over HTTP to DOCKER_HOST
 function curl_docker {
    local api_path="$1"
    case "$DOCKER_HOST" in
        socket)
            curl --unix-socket /var/run/docker.sock "http://localhost/$api_path" 2>/dev/null
            ;;
        *)
            curl "http://${DOCKER_HOST}/$api_path" 2>/dev/null
            ;;
    esac
 }
 # Prints the ID of the container whose name matches the given name
 function get_container_id {
    local wanted_name="$1"
    # Docker reports names with a leading slash, so match against "/<name>"
    local name_filter=".[] | {Id, Name: .Names[]} | select(.Name == \"/${wanted_name}\") | .Id"
    curl_docker 'containers/json?all=1' | jq -r "$name_filter"
 }
 # Prints the inspect JSON for the container with the given ID
 function inspect_container {
    local id="$1"
    curl_docker "containers/${id}/json"
 }
 # A container name is required as the first argument
 if [ -z "$container_name" ]; then
    echo "Usage: $0 container_name"
    echo "Will return results of healthcheck for continer with provided name"
    exit 1
 fi
 container_id=$(get_container_id "$container_name")
 if [ -z "$container_id" ]; then
    echo "ERROR: Could not find container with name: $container_name"
    exit 1
 fi
 # jq prints "null" when the container state has no Health.Status
 health=$(inspect_container "$container_id" | jq -r '.State.Health.Status')
 if [ "$health" == "null" ]; then
    echo "No healthcheck results"
 elif [ "$health" == "starting" ] || [ "$health" == "healthy" ]; then
    echo "Status: '$health'"
 else
    # Any other status is reported as a failure
    echo "Status: '$health'"
    exit 1
 fi
@@ -6,3 +6,4 @@ monitors:
    alert_down: [ 'alert_down', 'log_shell', 'log_command' ]
    # alert_every: -1
    alert_every: 0