Add Prometheus and Grafana services with alerting configuration

This commit is contained in:
elfateh4
2025-12-01 19:18:08 +01:00
parent 47e640b969
commit a924adee27
4 changed files with 170 additions and 0 deletions

17
alertmanager.yml Normal file
View File

@@ -0,0 +1,17 @@
# Alertmanager configuration: every alert is routed to one e-mail receiver.
global:
  # NOTE(review): host, sender and credentials below look like placeholders.
  # Replace them before relying on notifications, and prefer
  # `smtp_auth_password_file` over a password committed to the repository.
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: 'alertmanager@example.com'
  smtp_auth_username: 'alertmanager'
  smtp_auth_password: 'dummy_password'

route:
  # Default (and only) receiver for all alerts.
  receiver: 'email'
  # One notification group per alert name.
  group_by: ['alertname']
  # Short initial wait so alerts that fire together share one e-mail.
  group_wait: 10s
  # Minimum gap before a group that already notified sends an update.
  # Raised from 10s to the documented default: at 10s a flapping group
  # could produce a new e-mail every ten seconds.
  group_interval: 5m
  # Re-send a still-firing alert once per hour.
  repeat_interval: 1h

receivers:
  - name: 'email'
    email_configs:
      - to: 'admin@example.com'

View File

@@ -14,6 +14,9 @@ volumes:
pgadmin_data:
authelia_config:
authelia_db_data:
grafana_data:
prometheus_data:
alertmanager_data:
########################
# Services
@@ -90,6 +93,7 @@ services:
- --accesslog.filepath=/var/log/traefik/access.log
- --accesslog.bufferingsize=100
- --log.level=INFO
- --metrics.prometheus=true
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro
- traefik_letsencrypt:/letsencrypt
@@ -261,3 +265,73 @@ services:
- traefik.http.routers.pgadmin.entrypoints=websecure
- traefik.http.routers.pgadmin.tls.certresolver=le
- traefik.http.services.pgadmin.loadbalancer.server.port=80
## ─────────────────────────────────────────────
## Prometheus — monitoring
## ─────────────────────────────────────────────
prometheus:
  # Prometheus server, published through Traefik behind the Authelia middleware.
  # NOTE(review): `latest` is a floating tag; consider pinning a version so
  # restarts do not silently upgrade the server.
  image: prom/prometheus:latest
  container_name: prometheus
  restart: unless-stopped
  networks: [traefik_proxy]
  volumes:
    # Configuration and rules are only read by the server: mount read-only.
    - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
    - ./rules.yml:/etc/prometheus/rules.yml:ro
    # Persistent TSDB storage (matches --storage.tsdb.path below).
    - prometheus_data:/prometheus
    # Docker socket for the docker_sd_configs job defined in prometheus.yml.
    # NOTE(review): the image's default non-root user may not be allowed to
    # read this socket; verify that discovery actually works.
    - /var/run/docker.sock:/var/run/docker.sock:ro
  command:
    - '--config.file=/etc/prometheus/prometheus.yml'
    - '--storage.tsdb.path=/prometheus'
    - '--web.console.libraries=/etc/prometheus/console_libraries'
    - '--web.console.templates=/etc/prometheus/consoles'
    - '--storage.tsdb.retention.time=200h'
    # Enables the HTTP reload/quit endpoints (/-/reload, /-/quit).
    - '--web.enable-lifecycle'
    # Public URL, so links generated in alerts point at the Traefik hostname
    # rather than the container's internal address.
    - '--web.external-url=https://prometheus.gate.${DOMAIN}'
  labels:
    - traefik.enable=true
    - traefik.http.routers.prometheus.rule=Host(`prometheus.gate.${DOMAIN}`)
    - traefik.http.routers.prometheus.entrypoints=websecure
    - traefik.http.routers.prometheus.tls.certresolver=le
    - traefik.http.routers.prometheus.middlewares=authelia@docker,security-headers
    - traefik.http.services.prometheus.loadbalancer.server.port=9090
## ─────────────────────────────────────────────
## Grafana — visualization
## ─────────────────────────────────────────────
grafana:
  # Grafana dashboards, published through Traefik behind the Authelia middleware.
  # NOTE(review): `latest` is a floating tag; consider pinning a version.
  image: grafana/grafana:latest
  container_name: grafana
  restart: unless-stopped
  networks: [traefik_proxy]
  environment:
    # Required variable: Compose aborts with this message when it is unset or
    # empty, instead of silently substituting an empty admin password.
    GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}"
  volumes:
    # Persistent Grafana state.
    - grafana_data:/var/lib/grafana
  labels:
    - traefik.enable=true
    - traefik.http.routers.grafana.rule=Host(`grafana.gate.${DOMAIN}`)
    - traefik.http.routers.grafana.entrypoints=websecure
    - traefik.http.routers.grafana.tls.certresolver=le
    - traefik.http.routers.grafana.middlewares=authelia@docker,security-headers
    - traefik.http.services.grafana.loadbalancer.server.port=3000
## ─────────────────────────────────────────────
## Alertmanager — alert handling
## ─────────────────────────────────────────────
alertmanager:
  # Alertmanager, published through Traefik behind the Authelia middleware.
  # NOTE(review): `latest` is a floating tag; consider pinning a version.
  image: prom/alertmanager:latest
  container_name: alertmanager
  restart: unless-stopped
  networks: [traefik_proxy]
  volumes:
    # The configuration is only read by Alertmanager: mount it read-only.
    - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    # Persistent state directory (matches --storage.path below).
    - alertmanager_data:/alertmanager
  command:
    - '--config.file=/etc/alertmanager/alertmanager.yml'
    - '--storage.path=/alertmanager'
    # Public URL, so links in notification e-mails point at the Traefik
    # hostname rather than the container's internal address.
    - '--web.external-url=https://alertmanager.gate.${DOMAIN}'
  labels:
    - traefik.enable=true
    - traefik.http.routers.alertmanager.rule=Host(`alertmanager.gate.${DOMAIN}`)
    - traefik.http.routers.alertmanager.entrypoints=websecure
    - traefik.http.routers.alertmanager.tls.certresolver=le
    - traefik.http.routers.alertmanager.middlewares=authelia@docker,security-headers
    - traefik.http.services.alertmanager.loadbalancer.server.port=9093

43
prometheus.yml Normal file
View File

@@ -0,0 +1,43 @@
# Prometheus server configuration: scrape targets, rule files and the
# Alertmanager that receives fired alerts.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Relative paths are resolved against the directory of this config file.
rule_files:
  - rules.yml

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  # Prometheus scraping itself.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'traefik'
    metrics_path: /metrics
    static_configs:
      - targets: ['traefik:8080']

  # Authelia serves Prometheus metrics on its telemetry listener (default
  # port 9959), not on the 9091 web portal port, so scraping 9091 would leave
  # this job permanently down.
  # NOTE(review): requires `telemetry.metrics.enabled: true` in the Authelia
  # configuration; confirm the listener address there.
  - job_name: 'authelia'
    static_configs:
      - targets: ['authelia:9959']

  - job_name: 'grafana'
    static_configs:
      - targets: ['grafana:3000']

  # Containers discovered through the Docker socket.
  # NOTE(review): discovery yields a target per exposed container port; ports
  # that do not serve /metrics will report up == 0. Consider an opt-in label.
  - job_name: 'docker'
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
    relabel_configs:
      # Keep only containers that carry a Compose service label. The regex is
      # `.+` because `.*` also matches the empty string and filters nothing.
      - action: keep
        source_labels: [__meta_docker_container_label_com_docker_compose_service]
        regex: '.+'
      # Strip the leading slash from the Docker container name.
      - source_labels: [__meta_docker_container_name]
        regex: '/(.*)'
        target_label: container_name
      - source_labels: [__meta_docker_container_label_com_docker_compose_service]
        target_label: service

36
rules.yml Normal file
View File

@@ -0,0 +1,36 @@
# Prometheus rule file: per-job availability recording rules plus
# availability alerts evaluated on the `up` metric.
groups:
  # Pre-computed per-job aggregates of `up`.
  - name: recording_rules
    rules:
      # Sum of `up` per job, i.e. the number of targets currently up.
      - record: 'job:up:sum'
        expr: 'sum(up) by (job)'
      # Number of `up` series per job, i.e. the number of known targets.
      - record: 'job:up:count'
        expr: 'count(up) by (job)'

  - name: alerting_rules
    rules:
      # Any scrape target that has been failing for five minutes.
      - alert: InstanceDown
        expr: 'up == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Instance {{ $labels.instance }} down'
          description: >-
            {{ $labels.instance }} of job {{ $labels.job }}
            has been down for more than 5 minutes.

      # The Traefik scrape job has been failing for two minutes.
      - alert: TraefikDown
        expr: 'up{job="traefik"} == 0'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Traefik is down'
          description: 'Traefik has been down for more than 2 minutes.'

      # The Authelia scrape job has been failing for five minutes.
      - alert: AutheliaDown
        expr: 'up{job="authelia"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Authelia is down'
          description: 'Authelia authentication service is unavailable.'