Add Prometheus and Grafana services with alerting configuration
This commit is contained in:
17
alertmanager.yml
Normal file
17
alertmanager.yml
Normal file
@@ -0,0 +1,17 @@
|
||||
# Alertmanager configuration: every alert is delivered by email through a
# single receiver.
global:
  # Outgoing SMTP relay and the credentials used to authenticate against it.
  smtp_smarthost: "smtp.example.com:587"
  smtp_from: "alertmanager@example.com"
  smtp_auth_username: "alertmanager"
  # NOTE(review): placeholder credential committed in plain text. Replace it
  # before deploying and keep the real value out of version control.
  smtp_auth_password: "dummy_password"

route:
  # Alerts that share the same alertname are bundled into one notification.
  group_by:
    - alertname
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  # Default (and only) destination; must match a name under `receivers`.
  receiver: "email"

receivers:
  - name: "email"
    email_configs:
      - to: "admin@example.com"
|
||||
@@ -14,6 +14,9 @@ volumes:
|
||||
pgadmin_data:
|
||||
authelia_config:
|
||||
authelia_db_data:
|
||||
grafana_data:
|
||||
prometheus_data:
|
||||
alertmanager_data:
|
||||
|
||||
########################
|
||||
# Services
|
||||
@@ -90,6 +93,7 @@ services:
|
||||
- --accesslog.filepath=/var/log/traefik/access.log
|
||||
- --accesslog.bufferingsize=100
|
||||
- --log.level=INFO
|
||||
- --metrics.prometheus=true
|
||||
volumes:
|
||||
- /var/run/docker.sock:/var/run/docker.sock:ro
|
||||
- traefik_letsencrypt:/letsencrypt
|
||||
@@ -261,3 +265,73 @@ services:
|
||||
- traefik.http.routers.pgadmin.entrypoints=websecure
|
||||
- traefik.http.routers.pgadmin.tls.certresolver=le
|
||||
- traefik.http.services.pgadmin.loadbalancer.server.port=80
|
||||
|
||||
## ─────────────────────────────────────────────
|
||||
## Prometheus — monitoring
|
||||
## ─────────────────────────────────────────────
|
||||
# Prometheus server: scrapes metrics, evaluates the rules in rules.yml and is
# published through Traefik behind the Authelia middleware.
prometheus:
  # NOTE(review): `latest` is a floating tag; consider pinning a specific
  # release so upgrades are deliberate and reproducible.
  image: prom/prometheus:latest
  container_name: prometheus
  restart: unless-stopped
  networks: [traefik_proxy]
  volumes:
    # Configuration is managed on the host; mount it read-only so the
    # container cannot modify it.
    - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
    - ./rules.yml:/etc/prometheus/rules.yml:ro
    # Named volume holding the TSDB (see --storage.tsdb.path below).
    - prometheus_data:/prometheus
    # Mounted for the Docker-based service discovery in prometheus.yml.
    # NOTE(review): confirm the user this image runs as is allowed to read
    # the socket; otherwise that discovery will fail with a permission error.
    - /var/run/docker.sock:/var/run/docker.sock:ro
  command:
    - '--config.file=/etc/prometheus/prometheus.yml'
    - '--storage.tsdb.path=/prometheus'
    - '--web.console.libraries=/etc/prometheus/console_libraries'
    - '--web.console.templates=/etc/prometheus/consoles'
    - '--storage.tsdb.retention.time=200h'
    # Enables the HTTP reload/quit endpoints; access is gated by the Authelia
    # middleware configured in the labels below.
    - '--web.enable-lifecycle'
    # Public address of this instance (matches the router rule below), so the
    # source links attached to alerts resolve from outside the Docker network.
    - '--web.external-url=https://prometheus.gate.${DOMAIN}'
  labels:
    - traefik.enable=true
    - traefik.http.routers.prometheus.rule=Host(`prometheus.gate.${DOMAIN}`)
    - traefik.http.routers.prometheus.entrypoints=websecure
    - traefik.http.routers.prometheus.tls.certresolver=le
    - traefik.http.routers.prometheus.middlewares=authelia@docker,security-headers
    - traefik.http.services.prometheus.loadbalancer.server.port=9090
|
||||
|
||||
## ─────────────────────────────────────────────
|
||||
## Grafana — visualization
|
||||
## ─────────────────────────────────────────────
|
||||
# Grafana dashboards, published through Traefik behind the Authelia
# middleware. State is kept in the grafana_data named volume.
grafana:
  # NOTE(review): `latest` is a floating tag; consider pinning a specific
  # release so upgrades are deliberate and reproducible.
  image: grafana/grafana:latest
  container_name: grafana
  restart: unless-stopped
  networks: [traefik_proxy]
  environment:
    # Fail fast when the variable is missing: an unset variable would be
    # interpolated as an empty string and Grafana would keep its default
    # admin password.
    GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}
    # Public URL Grafana is served from (matches the router rule below), used
    # for the links and redirects it generates behind the reverse proxy.
    GF_SERVER_ROOT_URL: https://grafana.gate.${DOMAIN}
  volumes:
    - grafana_data:/var/lib/grafana
  labels:
    - traefik.enable=true
    - traefik.http.routers.grafana.rule=Host(`grafana.gate.${DOMAIN}`)
    - traefik.http.routers.grafana.entrypoints=websecure
    - traefik.http.routers.grafana.tls.certresolver=le
    - traefik.http.routers.grafana.middlewares=authelia@docker,security-headers
    - traefik.http.services.grafana.loadbalancer.server.port=3000
|
||||
|
||||
## ─────────────────────────────────────────────
|
||||
## Alertmanager — alert handling
|
||||
## ─────────────────────────────────────────────
|
||||
# Alertmanager: receives alerts from Prometheus and sends the notifications
# defined in alertmanager.yml. Published through Traefik behind Authelia.
alertmanager:
  # NOTE(review): `latest` is a floating tag; consider pinning a specific
  # release so upgrades are deliberate and reproducible.
  image: prom/alertmanager:latest
  container_name: alertmanager
  restart: unless-stopped
  networks: [traefik_proxy]
  volumes:
    # Configuration is managed on the host; mount it read-only so the
    # container cannot modify it.
    - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    # Named volume for Alertmanager's own state (see --storage.path below).
    - alertmanager_data:/alertmanager
  command:
    - '--config.file=/etc/alertmanager/alertmanager.yml'
    - '--storage.path=/alertmanager'
    # Public address of this instance (matches the router rule below), so the
    # links embedded in notifications resolve from outside the Docker network.
    - '--web.external-url=https://alertmanager.gate.${DOMAIN}'
  labels:
    - traefik.enable=true
    - traefik.http.routers.alertmanager.rule=Host(`alertmanager.gate.${DOMAIN}`)
    - traefik.http.routers.alertmanager.entrypoints=websecure
    - traefik.http.routers.alertmanager.tls.certresolver=le
    - traefik.http.routers.alertmanager.middlewares=authelia@docker,security-headers
    - traefik.http.services.alertmanager.loadbalancer.server.port=9093
|
||||
|
||||
43
prometheus.yml
Normal file
43
prometheus.yml
Normal file
@@ -0,0 +1,43 @@
|
||||
# Defaults applied to every job: targets are scraped and rules are evaluated
# every 15 seconds.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Recording and alerting rules to load.
rule_files:
  - "rules.yml"

# Where firing alerts are sent.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]
|
||||
|
||||
# Scrape jobs. The job names are referenced by the alert expressions in
# rules.yml (up{job="traefik"}, up{job="authelia"}), so keep them in sync.
scrape_configs:
  # Prometheus scraping its own metrics endpoint.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'traefik'
    static_configs:
      - targets: ['traefik:8080']
    metrics_path: /metrics

  # Authelia serves metrics on its dedicated telemetry listener (default port
  # 9959), not on the 9091 portal port. The listener has to be enabled in
  # Authelia's own configuration (telemetry.metrics.enabled); otherwise this
  # target stays down and AutheliaDown fires permanently.
  - job_name: 'authelia'
    static_configs:
      - targets: ['authelia:9959']

  - job_name: 'grafana'
    static_configs:
      - targets: ['grafana:3000']

  # Targets discovered through the Docker socket mounted into the container.
  # NOTE(review): every discovered container is scraped, including ones that
  # do not serve /metrics and ones already covered by the static jobs above;
  # those will report up == 0. Consider narrowing the keep rule to an explicit
  # opt-in label.
  - job_name: 'docker'
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
    relabel_configs:
      # Docker reports names with a leading slash; strip it.
      - source_labels: [__meta_docker_container_name]
        regex: '/(.*)'
        target_label: container_name
      # Expose the Compose service name as a `service` label.
      - source_labels: [__meta_docker_container_label_com_docker_compose_service]
        target_label: service
      # Keep only containers that carry a Compose service label. The regex
      # must be '.+': '.*' also matches an empty (missing) label and would
      # therefore keep every container, making the rule a no-op.
      - action: keep
        source_labels: [__meta_docker_container_label_com_docker_compose_service]
        regex: '.+'
|
||||
36
rules.yml
Normal file
36
rules.yml
Normal file
@@ -0,0 +1,36 @@
|
||||
# Rule groups loaded by Prometheus: one group of recording rules and one group
# of alerting rules, all derived from the `up` series.
groups:
  # Per-job aggregates of `up`: the sum of its values and the number of series.
  - name: recording_rules
    rules:
      - record: 'job:up:sum'
        expr: 'sum(up) by (job)'
      - record: 'job:up:count'
        expr: 'count(up) by (job)'

  - name: alerting_rules
    rules:
      # Any target, in any job, that has reported up == 0 for 5 minutes.
      - alert: InstanceDown
        expr: 'up == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Instance {{ $labels.instance }} down'
          description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.'

      # Targets of the "traefik" scrape job; shorter window, lower severity.
      - alert: TraefikDown
        expr: 'up{job="traefik"} == 0'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Traefik is down'
          description: 'Traefik has been down for more than 2 minutes.'

      # Targets of the "authelia" scrape job.
      - alert: AutheliaDown
        expr: 'up{job="authelia"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Authelia is down'
          description: 'Authelia authentication service is unavailable.'
|
||||
Reference in New Issue
Block a user