add cAdvisor and document detailed alert queries in README

Add cAdvisor container to the monitoring stack for container-level metrics. Configure Alloy to scrape cAdvisor. Expand the README Recommended Alerts section with exact PromQL/LogQL queries, thresholds, and Grafana alert rule configuration for all five alerts.
2026-03-22 22:51:22 +01:00
parent c736c23e9a
commit 926766346c
3 changed files with 114 additions and 8 deletions
--- a/monitoring/config.alloy
+++ b/monitoring/config.alloy
@@ -54,6 +54,18 @@ prometheus.scrape "node" {
  scrape_interval = "60s"
 }

+// ============================================================
+// cAdvisor container metrics -> Grafana Cloud Prometheus
+// ============================================================
+
+prometheus.scrape "cadvisor" {
+  // Scrape cAdvisor's /metrics endpoint every 60s and forward the samples to the Grafana Cloud remote_write component.
+  targets         = [{"__address__" = "cadvisor:8080"}]
+  metrics_path    = "/metrics"
+  scrape_interval = "60s"
+  forward_to      = [prometheus.remote_write.grafana_cloud.receiver]
+}
+
 prometheus.remote_write "grafana_cloud" {
  endpoint {
    url = env("GRAFANA_CLOUD_PROMETHEUS_URL")
--- a/monitoring/docker-compose.yml
+++ b/monitoring/docker-compose.yml
@@ -33,6 +33,27 @@ services:
        max-size: "10m"
        max-file: "3"

+  cadvisor:  # container-level metrics exporter; Alloy scrapes it at cadvisor:8080
+    image: gcr.io/cadvisor/cadvisor:v0.52.1
+    container_name: cadvisor
+    restart: unless-stopped
+    volumes:  # cAdvisor reads these fixed in-container paths (upstream run instructions); it ignores /host/proc and /host/sys
+      - /:/rootfs:ro
+      - /sys:/sys:ro
+      - /var/lib/docker:/var/lib/docker:ro
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+    command:
+      - --docker_only=true  # report Docker containers only, not every raw cgroup
+      - --housekeeping_interval=30s
+      - --disable_metrics=accelerator,cpu_topology,disk,diskIO,hugetlb,memory_numa,network,oom_event,percpu,perf_event,process,referenced_memory,resctrl,sched,tcp,udp
+    networks:
+      - monitoring
+    logging:  # cap json-file logs at 3 files of 10m each
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+
  alloy:
    image: grafana/alloy:v1.14.1
    container_name: alloy