Add Prometheus and Grafana services with alerting configuration
This commit is contained in:
17
alertmanager.yml
Normal file
17
alertmanager.yml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
# Alertmanager configuration: every alert is delivered to a single e-mail
# receiver through the SMTP relay declared under `global`.
global:
  # SMTP relay, envelope sender and login used for all e-mail notifications.
  smtp_smarthost: "smtp.example.com:587"
  smtp_from: "alertmanager@example.com"
  smtp_auth_username: "alertmanager"
  # NOTE(review): placeholder credential committed in plain text — replace it
  # before deploying and keep the real secret out of version control.
  smtp_auth_password: "dummy_password"

route:
  # Root route: the receiver named here must exist under `receivers`.
  receiver: "email"
  # Alerts that share an alertname are batched into one notification.
  group_by:
    - "alertname"
  # Timing of the first, follow-up and repeated notifications per group.
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h

receivers:
  - name: "email"
    email_configs:
      - to: "admin@example.com"
|
||||||
@@ -14,6 +14,9 @@ volumes:
|
|||||||
pgadmin_data:
|
pgadmin_data:
|
||||||
authelia_config:
|
authelia_config:
|
||||||
authelia_db_data:
|
authelia_db_data:
|
||||||
|
grafana_data:
|
||||||
|
prometheus_data:
|
||||||
|
alertmanager_data:
|
||||||
|
|
||||||
########################
|
########################
|
||||||
# Services
|
# Services
|
||||||
@@ -90,6 +93,7 @@ services:
|
|||||||
- --accesslog.filepath=/var/log/traefik/access.log
|
- --accesslog.filepath=/var/log/traefik/access.log
|
||||||
- --accesslog.bufferingsize=100
|
- --accesslog.bufferingsize=100
|
||||||
- --log.level=INFO
|
- --log.level=INFO
|
||||||
|
- --metrics.prometheus=true
|
||||||
volumes:
|
volumes:
|
||||||
- /var/run/docker.sock:/var/run/docker.sock:ro
|
- /var/run/docker.sock:/var/run/docker.sock:ro
|
||||||
- traefik_letsencrypt:/letsencrypt
|
- traefik_letsencrypt:/letsencrypt
|
||||||
@@ -261,3 +265,73 @@ services:
|
|||||||
- traefik.http.routers.pgadmin.entrypoints=websecure
|
- traefik.http.routers.pgadmin.entrypoints=websecure
|
||||||
- traefik.http.routers.pgadmin.tls.certresolver=le
|
- traefik.http.routers.pgadmin.tls.certresolver=le
|
||||||
- traefik.http.services.pgadmin.loadbalancer.server.port=80
|
- traefik.http.services.pgadmin.loadbalancer.server.port=80
|
||||||
|
|
||||||
|
## ─────────────────────────────────────────────
## Prometheus — monitoring
## ─────────────────────────────────────────────
# Service entry for the `services:` map of the compose file.
prometheus:
  image: prom/prometheus:latest
  container_name: prometheus
  restart: unless-stopped
  networks: [traefik_proxy]
  volumes:
    # Configuration and rule files are only read by Prometheus, so they are
    # mounted read-only (same policy as the Docker socket below).
    - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
    - ./rules.yml:/etc/prometheus/rules.yml:ro
    # Named volume holding the TSDB (see --storage.tsdb.path).
    - prometheus_data:/prometheus
    # Socket mounted read-only at the path used by `docker_sd_configs`.
    # NOTE(review): confirm the container user is allowed to read this socket.
    - /var/run/docker.sock:/var/run/docker.sock:ro
  command:
    - '--config.file=/etc/prometheus/prometheus.yml'
    - '--storage.tsdb.path=/prometheus'
    - '--web.console.libraries=/etc/prometheus/console_libraries'
    - '--web.console.templates=/etc/prometheus/consoles'
    # Keep samples for 200 hours (a little over 8 days).
    - '--storage.tsdb.retention.time=200h'
    # Enables the HTTP reload/shutdown endpoints (/-/reload, /-/quit).
    - '--web.enable-lifecycle'
  labels:
    # Published through Traefik over HTTPS, behind the Authelia middleware.
    - traefik.enable=true
    - traefik.http.routers.prometheus.rule=Host(`prometheus.gate.${DOMAIN}`)
    - traefik.http.routers.prometheus.entrypoints=websecure
    - traefik.http.routers.prometheus.tls.certresolver=le
    - traefik.http.routers.prometheus.middlewares=authelia@docker,security-headers
    - traefik.http.services.prometheus.loadbalancer.server.port=9090
|
||||||
|
|
||||||
|
## ─────────────────────────────────────────────
## Grafana — visualization
## ─────────────────────────────────────────────
# Service entry for the `services:` map of the compose file.
grafana:
  image: grafana/grafana:latest
  container_name: grafana
  restart: unless-stopped
  networks: [traefik_proxy]
  environment:
    # `:?` makes `docker compose` abort with this message when the variable is
    # unset or empty; a plain ${VAR} would be interpolated as an empty string
    # with only a warning, leaving the admin password undefined.
    GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}"
  volumes:
    # Named volume for Grafana's data directory.
    - grafana_data:/var/lib/grafana
  labels:
    # Published through Traefik over HTTPS, behind the Authelia middleware.
    - traefik.enable=true
    - traefik.http.routers.grafana.rule=Host(`grafana.gate.${DOMAIN}`)
    - traefik.http.routers.grafana.entrypoints=websecure
    - traefik.http.routers.grafana.tls.certresolver=le
    - traefik.http.routers.grafana.middlewares=authelia@docker,security-headers
    - traefik.http.services.grafana.loadbalancer.server.port=3000
|
||||||
|
|
||||||
|
## ─────────────────────────────────────────────
## Alertmanager — alert handling
## ─────────────────────────────────────────────
# Service entry for the `services:` map of the compose file.
alertmanager:
  image: prom/alertmanager:latest
  container_name: alertmanager
  restart: unless-stopped
  networks: [traefik_proxy]
  volumes:
    # The configuration file is only read by Alertmanager: mount it read-only.
    - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    # Named volume backing --storage.path.
    - alertmanager_data:/alertmanager
  command:
    - '--config.file=/etc/alertmanager/alertmanager.yml'
    - '--storage.path=/alertmanager'
  labels:
    # Published through Traefik over HTTPS, behind the Authelia middleware.
    - traefik.enable=true
    - traefik.http.routers.alertmanager.rule=Host(`alertmanager.gate.${DOMAIN}`)
    - traefik.http.routers.alertmanager.entrypoints=websecure
    - traefik.http.routers.alertmanager.tls.certresolver=le
    - traefik.http.routers.alertmanager.middlewares=authelia@docker,security-headers
    - traefik.http.services.alertmanager.loadbalancer.server.port=9093
|
||||||
|
|||||||
43
prometheus.yml
Normal file
43
prometheus.yml
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
# Prometheus server configuration: global timings, rule files, the
# Alertmanager endpoint and the list of scrape jobs.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Relative paths are resolved against the directory of this config file.
rule_files:
  - rules.yml

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  # Prometheus scraping itself.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'traefik'
    metrics_path: /metrics
    static_configs:
      - targets: ['traefik:8080']

  # Authelia serves Prometheus metrics on its telemetry listener (default
  # port 9959), not on the 9091 web/API port.
  # NOTE(review): requires `telemetry.metrics.enabled: true` in the Authelia
  # configuration — confirm, and adjust the port if a custom address is set.
  - job_name: 'authelia'
    static_configs:
      - targets: ['authelia:9959']

  - job_name: 'grafana'
    static_configs:
      - targets: ['grafana:3000']

  # Targets discovered from the local Docker daemon.
  - job_name: 'docker'
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
    relabel_configs:
      # Docker reports names as "/name"; strip the leading slash.
      - source_labels: [__meta_docker_container_name]
        regex: '/(.*)'
        target_label: container_name
      - source_labels: [__meta_docker_container_label_com_docker_compose_service]
        target_label: service
      # Keep only containers that carry a Compose service label. Relabel
      # regexes are fully anchored, so '.+' requires a non-empty value; the
      # previous '.*' also matched the empty string and filtered nothing.
      - action: keep
        source_labels: [__meta_docker_container_label_com_docker_compose_service]
        regex: '.+'
|
||||||
36
rules.yml
Normal file
36
rules.yml
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
# Prometheus rule file: one group of recording rules and one of alerts, all
# derived from the `up` series.
groups:
  # Per-job aggregates of `up`: its sum and the number of series.
  - name: 'recording_rules'
    rules:
      - record: 'job:up:sum'
        expr: 'sum(up) by (job)'
      - record: 'job:up:count'
        expr: 'count(up) by (job)'

  - name: 'alerting_rules'
    rules:
      # Generic alert: any scrape target whose `up` has been 0 for 5 minutes.
      - alert: 'InstanceDown'
        expr: 'up == 0'
        for: 5m
        labels:
          severity: 'critical'
        annotations:
          summary: 'Instance {{ $labels.instance }} down'
          description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.'

      # Job-specific alert for the `traefik` scrape job (2 minute hold).
      - alert: 'TraefikDown'
        expr: 'up{job="traefik"} == 0'
        for: 2m
        labels:
          severity: 'warning'
        annotations:
          summary: 'Traefik is down'
          description: 'Traefik has been down for more than 2 minutes.'

      # Job-specific alert for the `authelia` scrape job (5 minute hold).
      - alert: 'AutheliaDown'
        expr: 'up{job="authelia"} == 0'
        for: 5m
        labels:
          severity: 'critical'
        annotations:
          summary: 'Authelia is down'
          description: 'Authelia authentication service is unavailable.'
|
||||||
Reference in New Issue
Block a user