From a924adee277fbf936d5cf513f8ca7c3683eb21b8 Mon Sep 17 00:00:00 2001
From: elfateh4
Date: Mon, 1 Dec 2025 19:18:08 +0100
Subject: [PATCH] Add Prometheus and Grafana services with alerting configuration

Add Prometheus, Grafana and Alertmanager to the Compose stack, routed
through Traefik and protected by Authelia. Enable Traefik's Prometheus
metrics and add the scrape, recording-rule and alerting-rule
configuration.

Details:
- Configuration files are bind-mounted read-only.
- GRAFANA_ADMIN_PASSWORD is mandatory; Compose aborts when it is unset
  or empty instead of starting Grafana with an empty admin password.
- Authelia is scraped on its telemetry port (9959) rather than on the
  web port (9091).
- The Docker service-discovery job keeps only containers that carry a
  non-empty Compose service label.
- All new files end with a trailing newline.

---
 alertmanager.yml   | 20 ++++++++++++
 docker-compose.yml | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++
 prometheus.yml     | 47 +++++++++++++++++++++++++++++
 rules.yml          | 39 ++++++++++++++++++++++++
 4 files changed, 186 insertions(+)
 create mode 100644 alertmanager.yml
 create mode 100644 prometheus.yml
 create mode 100644 rules.yml

diff --git a/alertmanager.yml b/alertmanager.yml
new file mode 100644
index 0000000..5684b39
--- /dev/null
+++ b/alertmanager.yml
@@ -0,0 +1,20 @@
+global:
+  smtp_smarthost: 'smtp.example.com:587'
+  smtp_from: 'alertmanager@example.com'
+  smtp_auth_username: 'alertmanager'
+  # Placeholder only: never commit a real credential here. For a real
+  # deployment, mount a secret and use smtp_auth_password_file instead.
+  smtp_auth_password: 'dummy_password'
+
+route:
+  # Batch alerts that share an alertname into a single notification.
+  group_by: ['alertname']
+  group_wait: 10s
+  group_interval: 10s
+  repeat_interval: 1h
+  receiver: 'email'
+
+receivers:
+  - name: 'email'
+    email_configs:
+      - to: 'admin@example.com'
diff --git a/docker-compose.yml b/docker-compose.yml
index ca4fbca..3f03f0b 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -14,6 +14,9 @@ volumes:
   pgadmin_data:
   authelia_config:
   authelia_db_data:
+  grafana_data:
+  prometheus_data:
+  alertmanager_data:
 
 ########################
 # Services
@@ -90,6 +93,7 @@ services:
       - --accesslog.filepath=/var/log/traefik/access.log
       - --accesslog.bufferingsize=100
       - --log.level=INFO
+      - --metrics.prometheus=true
     volumes:
       - /var/run/docker.sock:/var/run/docker.sock:ro
       - traefik_letsencrypt:/letsencrypt
@@ -261,3 +265,79 @@ services:
       - traefik.http.routers.pgadmin.entrypoints=websecure
       - traefik.http.routers.pgadmin.tls.certresolver=le
       - traefik.http.services.pgadmin.loadbalancer.server.port=80
+
+  ## ─────────────────────────────────────────────
+  ## Prometheus — monitoring
+  ## ─────────────────────────────────────────────
+  prometheus:
+    image: prom/prometheus:latest
+    container_name: prometheus
+    restart: unless-stopped
+    networks: [traefik_proxy]
+    volumes:
+      # Configuration files are mounted read-only; only the TSDB volume is writable.
+      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ./rules.yml:/etc/prometheus/rules.yml:ro
+      - prometheus_data:/prometheus
+      # Read by the docker_sd_configs scrape job in prometheus.yml.
+      # NOTE(review): confirm the container user is allowed to read this socket.
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--web.console.libraries=/etc/prometheus/console_libraries'
+      - '--web.console.templates=/etc/prometheus/consoles'
+      - '--storage.tsdb.retention.time=200h'
+      # Exposes the HTTP /-/reload and /-/quit management endpoints.
+      - '--web.enable-lifecycle'
+    labels:
+      - traefik.enable=true
+      - traefik.http.routers.prometheus.rule=Host(`prometheus.gate.${DOMAIN}`)
+      - traefik.http.routers.prometheus.entrypoints=websecure
+      - traefik.http.routers.prometheus.tls.certresolver=le
+      - traefik.http.routers.prometheus.middlewares=authelia@docker,security-headers
+      - traefik.http.services.prometheus.loadbalancer.server.port=9090
+
+  ## ─────────────────────────────────────────────
+  ## Grafana — visualization
+  ## ─────────────────────────────────────────────
+  grafana:
+    image: grafana/grafana:latest
+    container_name: grafana
+    restart: unless-stopped
+    networks: [traefik_proxy]
+    environment:
+      # Fail fast: Compose aborts when the variable is unset or empty instead
+      # of silently handing Grafana an empty admin password.
+      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}"
+    volumes:
+      - grafana_data:/var/lib/grafana
+    labels:
+      - traefik.enable=true
+      - traefik.http.routers.grafana.rule=Host(`grafana.gate.${DOMAIN}`)
+      - traefik.http.routers.grafana.entrypoints=websecure
+      - traefik.http.routers.grafana.tls.certresolver=le
+      - traefik.http.routers.grafana.middlewares=authelia@docker,security-headers
+      - traefik.http.services.grafana.loadbalancer.server.port=3000
+
+  ## ─────────────────────────────────────────────
+  ## Alertmanager — alert handling
+  ## ─────────────────────────────────────────────
+  alertmanager:
+    image: prom/alertmanager:latest
+    container_name: alertmanager
+    restart: unless-stopped
+    networks: [traefik_proxy]
+    volumes:
+      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+      - alertmanager_data:/alertmanager
+    command:
+      - '--config.file=/etc/alertmanager/alertmanager.yml'
+      - '--storage.path=/alertmanager'
+    labels:
+      - traefik.enable=true
+      - traefik.http.routers.alertmanager.rule=Host(`alertmanager.gate.${DOMAIN}`)
+      - traefik.http.routers.alertmanager.entrypoints=websecure
+      - traefik.http.routers.alertmanager.tls.certresolver=le
+      - traefik.http.routers.alertmanager.middlewares=authelia@docker,security-headers
+      - traefik.http.services.alertmanager.loadbalancer.server.port=9093
diff --git a/prometheus.yml b/prometheus.yml
new file mode 100644
index 0000000..eb57ec1
--- /dev/null
+++ b/prometheus.yml
@@ -0,0 +1,47 @@
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+rule_files:
+  - rules.yml
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+            - alertmanager:9093
+
+scrape_configs:
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: 'traefik'
+    static_configs:
+      - targets: ['traefik:8080']
+    metrics_path: /metrics
+
+  # Authelia serves Prometheus metrics on its telemetry listener (default
+  # :9959, enabled via telemetry.metrics.enabled), not on the 9091 web port.
+  - job_name: 'authelia'
+    static_configs:
+      - targets: ['authelia:9959']
+
+  - job_name: 'grafana'
+    static_configs:
+      - targets: ['grafana:3000']
+
+  - job_name: 'docker'
+    docker_sd_configs:
+      - host: unix:///var/run/docker.sock
+    relabel_configs:
+      - source_labels: [__meta_docker_container_name]
+        regex: '/(.*)'
+        target_label: container_name
+      - source_labels: [__meta_docker_container_label_com_docker_compose_service]
+        target_label: service
+      # Keep only Compose-managed containers. Relabel regexes are fully
+      # anchored, so '.+' requires a non-empty label ('.*' kept everything).
+      - action: keep
+        source_labels: [__meta_docker_container_label_com_docker_compose_service]
+        regex: '.+'
diff --git a/rules.yml b/rules.yml
new file mode 100644
index 0000000..a96694d
--- /dev/null
+++ b/rules.yml
@@ -0,0 +1,39 @@
+groups:
+  - name: recording_rules
+    rules:
+      # Per job: number of targets that are up, and number of targets known.
+      - record: job:up:sum
+        expr: sum(up) by (job)
+      - record: job:up:count
+        expr: count(up) by (job)
+
+  - name: alerting_rules
+    rules:
+      # Matches every job, so Traefik and Authelia targets raise this alert
+      # in addition to their dedicated alerts below.
+      - alert: InstanceDown
+        expr: up == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Instance {{ $labels.instance }} down"
+          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
+
+      - alert: TraefikDown
+        expr: up{job="traefik"} == 0
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Traefik is down"
+          description: "Traefik has been down for more than 2 minutes."
+
+      - alert: AutheliaDown
+        expr: up{job="authelia"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Authelia is down"
+          description: "Authelia authentication service is unavailable."