commit 00e2c83bebb6bb5acb01d9b6f3c9603f9caaab5d Author: kvanbezouw Date: Tue Jun 2 11:46:19 2026 +0200 Import part 1 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3fa69c5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +# Large benchmark output logs — reproducible, not versioned +llm-throughput-tests-mindef-metadateren/results/**/benchmark_io.log +*.log diff --git a/grafana/ubiops-sre/README.md b/grafana/ubiops-sre/README.md new file mode 100644 index 0000000..cf6e56c --- /dev/null +++ b/grafana/ubiops-sre/README.md @@ -0,0 +1,36 @@ +# UbiOps Deployments Dashboard + +Grafana dashboard (`dashboard.json`) for monitoring UbiOps deployment pods on Kubernetes — health, resource usage, restarts, and limits. Data comes from Prometheus (`kube-state-metrics` + cAdvisor `container_*` metrics). + +## Variables + +| Variable | Source | Purpose | +|----------|--------|---------| +| `datasource` | Prometheus datasource picker | Select the Prometheus instance | +| `namespace` | `label_values(kube_pod_info, namespace)` | Namespace to scope to | +| `deployment` | `label_values(kube_deployment_metadata_generation{namespace=$namespace}, deployment)` | Deployment to inspect (defaults to all, `.*`) | + +Pods are matched by the prefix regex `pod=~"$deployment.*"`, so a deployment selection covers all of its pods — and also the pods of any other deployment whose name starts with the same string (e.g. selecting `api` also matches `api-gateway-…` pods). + +## Rows & panels + +**Overview** — at-a-glance stat tiles: Running / Pending / Failed pods, Restarts (1h), OOMKilled (1h), Waiting containers. + +**Resource Usage** — CPU and memory working-set usage per pod over time. + +**Deployment Status** — desired vs. available replicas, and container restart rate. + +**Resource Limits** — usage vs. limits for CPU and memory (aggregate and per-pod), plus per-pod limits and **% of limit** (green/yellow/red at 70%/90%) to spot pods approaching OOM. + +**Pod Details** — table of every pod with restart count and memory % of limit, sorted by restarts (highest first). + +## Usage + +Default time range is the last 1h with 30s auto-refresh. 
Import into Grafana (schema `dashboard.grafana.app/v2`, built on Grafana v13), then pick a datasource, namespace, and deployment. + +## Key things to watch + +- **OOMKilled (1h)** and **Memory % of Limit** — memory pressure / under-provisioned limits. +- **Restarts** and **Container Restart Rate** — crash loops. +- **Pending / Failed pods** — scheduling or startup problems. +- **Replicas** (desired vs. available) — incomplete rollouts. diff --git a/grafana/ubiops-sre/dashboard.json b/grafana/ubiops-sre/dashboard.json new file mode 100644 index 0000000..8bb830b --- /dev/null +++ b/grafana/ubiops-sre/dashboard.json @@ -0,0 +1,2828 @@ +{ + "apiVersion": "dashboard.grafana.app/v2", + "kind": "Dashboard", + "metadata": { + "name": "ubiops-deployments-v2", + "namespace": "default", + "uid": "lcNkD29gToeEqZyG02ufxUS3KmhAV3Q5V68hyuObpzwX", + "resourceVersion": "1780386365528981", + "generation": 5, + "creationTimestamp": "2026-02-09T14:48:27Z", + "labels": { + "grafana.app/deprecatedInternalID": "4601" + }, + "annotations": { + "grafana.app/createdBy": "user:ffc8gwlm9q2gwb", + "grafana.app/folder": "", + "grafana.app/saved-from-ui": "Grafana v13.0.1 (a100054f)", + "grafana.app/updatedBy": "user:ffc8gwlm9q2gwb", + "grafana.app/updatedTimestamp": "2026-06-02T07:46:05Z" + } + }, + "spec": { + "annotations": [ + { + "kind": "AnnotationQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "grafana", + "version": "v0", + "datasource": { + "name": "-- Grafana --" + }, + "spec": {} + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "builtIn": true + } + } + ], + "cursorSync": "Crosshair", + "description": "Monitor UbiOps deployment pods: status, resource usage, restarts, and limits.", + "editable": true, + "elements": { + "panel-10": { + "kind": "Panel", + "spec": { + "id": 10, + "title": "Memory Usage by Pod", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": 
[ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "sum by (pod) (container_memory_working_set_bytes{namespace=\"$namespace\", pod=~\"$deployment.*\", container!=\"\", container!=\"POD\"})", + "legendFormat": "{{ pod }}" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "13.0.1", + "spec": { + "options": { + "annotations": { + "clustering": -1, + "multiLane": false + }, + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "bytes", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-12": { + "kind": "Panel", + "spec": { + "id": 12, + "title": "Deployment Replicas", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + 
"kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "sum by (deployment) (kube_deployment_status_replicas_available{namespace=\"$namespace\", deployment=~\"$deployment\"})", + "legendFormat": "{{ deployment }} available" + } + }, + "refId": "A", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "sum by (deployment) (kube_deployment_spec_replicas{namespace=\"$namespace\", deployment=~\"$deployment\"})", + "legendFormat": "{{ deployment }} desired" + } + }, + "refId": "B", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "13.0.1", + "spec": { + "options": { + "annotations": { + "clustering": -1, + "multiLane": false + }, + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, 
+ "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-13": { + "kind": "Panel", + "spec": { + "id": 13, + "title": "Container Restart Rate", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "sum by (pod, container) (increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\", pod=~\"$deployment.*\"}[$__rate_interval]))", + "legendFormat": "{{ pod }}/{{ container }}" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "13.0.1", + "spec": { + "options": { + "annotations": { + "clustering": -1, + "multiLane": false + }, + "legend": { + "calcs": [ + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": 
false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-15": { + "kind": "Panel", + "spec": { + "id": 15, + "title": "Memory: Usage vs Limits", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "sum(container_memory_working_set_bytes{namespace=\"$namespace\", pod=~\"$deployment.*\", container!=\"\", container!=\"POD\"} * on(namespace, pod) group_left max by(namespace, pod) (kube_pod_info{namespace=\"$namespace\", pod=~\"$deployment.*\"}))", + "legendFormat": "usage" + } + }, + "refId": "A", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "sum(kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$deployment.*\", resource=\"memory\"})", + "legendFormat": "limit" + } + }, + "refId": "B", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "12.3.2", + "spec": { + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "bytes", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + 
"axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*limit.*" + }, + "properties": [ + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + } + ] + } + ] + } + } + } + } + }, + "panel-16": { + "kind": "Panel", + "spec": { + "id": 16, + "title": "CPU: Usage vs Limits", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod=~\"$deployment.*\", container!=\"\", container!=\"POD\"}[$__rate_interval]) * on(namespace, pod) group_left max by(namespace, pod) (kube_pod_info{namespace=\"$namespace\", pod=~\"$deployment.*\"}))", + "legendFormat": "usage" + } + }, + "refId": "A", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "sum(kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$deployment.*\", resource=\"cpu\"})", + "legendFormat": "limit" + } + }, + "refId": "B", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": 
"timeseries", + "version": "12.3.2", + "spec": { + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*limit.*" + }, + "properties": [ + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + } + ] + } + ] + } + } + } + } + }, + "panel-18": { + "kind": "Panel", + "spec": { + "id": 18, + "title": "Pod Status", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "kube_pod_info{namespace=\"$namespace\", pod=~\"$deployment.*\"}", + "format": "table", + "instant": true + } + }, + "refId": "A", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + 
"group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "sum by (pod) (kube_pod_container_status_restarts_total{namespace=\"$namespace\", pod=~\"$deployment.*\"})", + "format": "table", + "instant": true + } + }, + "refId": "B", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "100 * sum by (pod) (container_memory_working_set_bytes{namespace=\"$namespace\", pod=~\"$deployment.*\", container!=\"\", container!=\"POD\"}) / sum by (pod) (kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$deployment.*\", resource=\"memory\"})", + "format": "table", + "instant": true + } + }, + "refId": "C", + "hidden": false + } + } + ], + "transformations": [ + { + "kind": "Transformation", + "group": "seriesToColumns", + "spec": { + "options": { + "byField": "pod" + } + } + }, + { + "kind": "Transformation", + "group": "organize", + "spec": { + "options": { + "excludeByName": { + "Time": true, + "Time 1": true, + "Time 2": true, + "Time 3": true, + "__name__": true, + "container_id": true, + "created_by_kind": true, + "created_by_name": true, + "host_ip": true, + "host_network": true, + "instance": true, + "job": true, + "namespace": true, + "node": true, + "pod_ip": true, + "priority_class": true, + "uid": true + }, + "renameByName": { + "Value #B": "Restarts", + "Value #C": "Memory %", + "pod": "Pod" + } + } + } + } + ], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "table", + "version": "12.3.2", + "spec": { + "options": { + "cellHeight": "sm", + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Restarts" + } + ] + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + 
] + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "footer": { + "reducers": [] + }, + "inspect": false + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Restarts" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "mode": "gradient", + "type": "gauge" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + { + "id": "max", + "value": 10 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Memory %" + }, + "properties": [ + { + "id": "unit", + "value": "percent" + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "gradient", + "type": "gauge" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + } + }, + { + "id": "max", + "value": 100 + } + ] + } + ] + } + } + } + } + }, + "panel-19": { + "kind": "Panel", + "spec": { + "id": 19, + "title": "Memory Usage by Pod", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "sum by (pod) (container_memory_working_set_bytes{namespace=\"$namespace\", pod=~\"$deployment.*\", container!=\"\", container!=\"POD\"})", + "legendFormat": "{{ pod }}" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "12.3.2", + "spec": { + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + 
"showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "bytes", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-2": { + "kind": "Panel", + "spec": { + "id": 2, + "title": "Running Pods", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "sum(kube_pod_status_phase{namespace=\"$namespace\", pod=~\"$deployment.*\", phase=\"Running\"}) or vector(0)", + "legendFormat": "Running" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], 
+ "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "stat", + "version": "13.0.1", + "spec": { + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + } + ] + }, + "color": { + "mode": "thresholds" + } + }, + "overrides": [] + } + } + } + } + }, + "panel-20": { + "kind": "Panel", + "spec": { + "id": 20, + "title": "CPU Usage by Pod", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "sum by (pod) (rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod=~\"$deployment.*\", container!=\"\", container!=\"POD\"}[$__rate_interval]))", + "legendFormat": "{{ pod }}" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "12.3.2", + "spec": { + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + 
"axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-21": { + "kind": "Panel", + "spec": { + "id": 21, + "title": "Memory Limits by Pod", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "sum by (pod) (kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$deployment.*\", resource=\"memory\"})", + "legendFormat": "{{ pod }}" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "12.3.2", + "spec": { + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "bytes", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 
0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-22": { + "kind": "Panel", + "spec": { + "id": 22, + "title": "Memory % of Limit by Pod", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "100 * sum by (pod) (container_memory_working_set_bytes{namespace=\"$namespace\", pod=~\"$deployment.*\", container!=\"\", container!=\"POD\"}) / sum by (pod) (kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$deployment.*\", resource=\"memory\"})", + "legendFormat": "{{ pod }}" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "12.3.2", + "spec": { + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + 
"showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 70, + "color": "yellow" + }, + { + "value": 90, + "color": "red" + } + ] + }, + "color": { + "mode": "thresholds" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-23": { + "kind": "Panel", + "spec": { + "id": 23, + "title": "CPU Limits by Pod", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "sum by (pod) (kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$deployment.*\", resource=\"cpu\"})", + 
"legendFormat": "{{ pod }}" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "12.3.2", + "spec": { + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-24": { + "kind": "Panel", + "spec": { + "id": 24, + "title": "CPU % of Limit by Pod", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "100 * sum by (pod) (rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod=~\"$deployment.*\", container!=\"\", 
container!=\"POD\"}[$__rate_interval])) / sum by (pod) (kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$deployment.*\", resource=\"cpu\"})", + "legendFormat": "{{ pod }}" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "12.3.2", + "spec": { + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 70, + "color": "yellow" + }, + { + "value": 90, + "color": "red" + } + ] + }, + "color": { + "mode": "thresholds" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-3": { + "kind": "Panel", + "spec": { + "id": 3, + "title": "Pending Pods", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + 
"expr": "sum(kube_pod_status_phase{namespace=\"$namespace\", pod=~\"$deployment.*\", phase=\"Pending\"}) or vector(0)", + "legendFormat": "Pending" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "stat", + "version": "13.0.1", + "spec": { + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 1, + "color": "yellow" + } + ] + }, + "color": { + "mode": "thresholds" + } + }, + "overrides": [] + } + } + } + } + }, + "panel-4": { + "kind": "Panel", + "spec": { + "id": 4, + "title": "Failed Pods", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "sum(kube_pod_status_phase{namespace=\"$namespace\", pod=~\"$deployment.*\", phase=\"Failed\"}) or vector(0)", + "legendFormat": "Failed" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "stat", + "version": "13.0.1", + "spec": { + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "fieldConfig": 
{ + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 1, + "color": "red" + } + ] + }, + "color": { + "mode": "thresholds" + } + }, + "overrides": [] + } + } + } + } + }, + "panel-5": { + "kind": "Panel", + "spec": { + "id": 5, + "title": "Restarts (1h)", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\", pod=~\"$deployment.*\"}[1h]))", + "legendFormat": "Restarts" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "stat", + "version": "13.0.1", + "spec": { + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 1, + "color": "yellow" + }, + { + "value": 5, + "color": "red" + } + ] + }, + "color": { + "mode": "thresholds" + } + }, + "overrides": [] + } + } + } + } + }, + "panel-6": { + "kind": "Panel", + "spec": { + "id": 6, + "title": "OOMKilled (1h)", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": 
"sum(increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\", pod=~\"$deployment.*\"}[1h]) * on(namespace, pod, container) group_left kube_pod_container_status_last_terminated_reason{namespace=\"$namespace\", pod=~\"$deployment.*\", reason=\"OOMKilled\"}) or vector(0)", + "legendFormat": "OOMKilled" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "stat", + "version": "13.0.1", + "spec": { + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 1, + "color": "red" + } + ] + }, + "color": { + "mode": "thresholds" + } + }, + "overrides": [] + } + } + } + } + }, + "panel-7": { + "kind": "Panel", + "spec": { + "id": 7, + "title": "Waiting", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "sum(kube_pod_container_status_waiting{namespace=\"$namespace\", pod=~\"$deployment.*\"}) or vector(0)", + "legendFormat": "Waiting" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "stat", + "version": "13.0.1", + "spec": { + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + 
"calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 1, + "color": "yellow" + } + ] + }, + "color": { + "mode": "thresholds" + } + }, + "overrides": [] + } + } + } + } + }, + "panel-9": { + "kind": "Panel", + "spec": { + "id": 9, + "title": "CPU Usage by Pod", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "expr": "sum by (pod) (rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod=~\"$deployment.*\", container!=\"\", container!=\"POD\"}[$__rate_interval]))", + "legendFormat": "{{ pod }}" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "13.0.1", + "spec": { + "options": { + "annotations": { + "clustering": -1, + "multiLane": false + }, + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 80, + "color": "red" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + 
"hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + } + }, + "layout": { + "kind": "RowsLayout", + "spec": { + "rows": [ + { + "kind": "RowsLayoutRow", + "spec": { + "title": "Overview", + "collapse": false, + "layout": { + "kind": "GridLayout", + "spec": { + "items": [ + { + "kind": "GridLayoutItem", + "spec": { + "x": 0, + "y": 0, + "width": 4, + "height": 4, + "element": { + "kind": "ElementReference", + "name": "panel-2" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 4, + "y": 0, + "width": 4, + "height": 4, + "element": { + "kind": "ElementReference", + "name": "panel-3" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 8, + "y": 0, + "width": 4, + "height": 4, + "element": { + "kind": "ElementReference", + "name": "panel-4" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 12, + "y": 0, + "width": 4, + "height": 4, + "element": { + "kind": "ElementReference", + "name": "panel-5" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 16, + "y": 0, + "width": 4, + "height": 4, + "element": { + "kind": "ElementReference", + "name": "panel-6" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 20, + "y": 0, + "width": 4, + "height": 4, + "element": { + "kind": "ElementReference", + "name": "panel-7" + } + } + } + ] + } + } + } + }, + { + "kind": "RowsLayoutRow", + "spec": { + "title": "Resource Usage", + "collapse": false, + "layout": { + "kind": "GridLayout", + "spec": { + "items": [ + { + "kind": "GridLayoutItem", + "spec": { + "x": 0, + "y": 0, + "width": 12, + "height": 8, + "element": { + "kind": "ElementReference", + "name": 
"panel-9" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 12, + "y": 0, + "width": 12, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-10" + } + } + } + ] + } + } + } + }, + { + "kind": "RowsLayoutRow", + "spec": { + "title": "Deployment Status", + "collapse": false, + "layout": { + "kind": "GridLayout", + "spec": { + "items": [ + { + "kind": "GridLayoutItem", + "spec": { + "x": 0, + "y": 0, + "width": 12, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-12" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 12, + "y": 0, + "width": 12, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-13" + } + } + } + ] + } + } + } + }, + { + "kind": "RowsLayoutRow", + "spec": { + "title": "Resource Limits", + "collapse": false, + "layout": { + "kind": "GridLayout", + "spec": { + "items": [ + { + "kind": "GridLayoutItem", + "spec": { + "x": 0, + "y": 0, + "width": 12, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-15" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 12, + "y": 0, + "width": 12, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-16" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 0, + "y": 8, + "width": 8, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-19" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 8, + "y": 8, + "width": 8, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-21" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 16, + "y": 8, + "width": 8, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-22" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 0, + "y": 16, + "width": 8, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-20" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 8, + "y": 
16, + "width": 8, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-23" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 16, + "y": 16, + "width": 8, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-24" + } + } + } + ] + } + } + } + }, + { + "kind": "RowsLayoutRow", + "spec": { + "title": "Pod Details", + "collapse": false, + "layout": { + "kind": "GridLayout", + "spec": { + "items": [ + { + "kind": "GridLayoutItem", + "spec": { + "x": 0, + "y": 0, + "width": 24, + "height": 10, + "element": { + "kind": "ElementReference", + "name": "panel-18" + } + } + } + ] + } + } + } + } + ] + } + }, + "links": [], + "liveNow": false, + "preload": false, + "tags": [ + "ubiops", + "kubernetes", + "deployments" + ], + "timeSettings": { + "timezone": "browser", + "from": "now-1h", + "to": "now", + "autoRefresh": "30s", + "autoRefreshIntervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "hideTimepicker": false, + "fiscalYearStartMonth": 0 + }, + "title": "UbiOps Deployments Dev", + "variables": [ + { + "kind": "DatasourceVariable", + "spec": { + "name": "datasource", + "pluginId": "prometheus", + "refresh": "onDashboardLoad", + "regex": "", + "current": { + "text": "", + "value": "" + }, + "options": [], + "multi": false, + "includeAll": false, + "label": "Data Source", + "hide": "dontHide", + "skipUrlSync": false, + "allowCustomValue": true + } + }, + { + "kind": "QueryVariable", + "spec": { + "name": "namespace", + "current": { + "text": "", + "value": "" + }, + "label": "Namespace", + "hide": "dontHide", + "refresh": "onDashboardLoad", + "skipUrlSync": false, + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "query": "label_values(kube_pod_info, namespace)", + "refId": "A" + } + }, + "regex": "", + "regexApplyTo": "value", + "sort": "alphabeticalAsc", + "definition": 
"label_values(kube_pod_info, namespace)", + "options": [], + "multi": false, + "includeAll": false, + "allowCustomValue": true + } + }, + { + "kind": "QueryVariable", + "spec": { + "name": "deployment", + "current": { + "text": "", + "value": "" + }, + "label": "Deployment", + "hide": "dontHide", + "refresh": "onTimeRangeChanged", + "skipUrlSync": false, + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${datasource}" + }, + "spec": { + "query": "label_values(kube_deployment_metadata_generation{namespace=\"$namespace\"}, deployment)", + "refId": "A" + } + }, + "regex": "", + "regexApplyTo": "value", + "sort": "alphabeticalAsc", + "definition": "label_values(kube_deployment_metadata_generation{namespace=\"$namespace\"}, deployment)", + "options": [], + "multi": false, + "includeAll": true, + "allValue": ".*", + "allowCustomValue": true + } + } + ] + } +} \ No newline at end of file diff --git a/grafana/ubiops-sre/image.png b/grafana/ubiops-sre/image.png new file mode 100644 index 0000000..5c00e28 Binary files /dev/null and b/grafana/ubiops-sre/image.png differ diff --git a/grafana/vllm-metrics/README.md b/grafana/vllm-metrics/README.md new file mode 100644 index 0000000..be9fef2 --- /dev/null +++ b/grafana/vllm-metrics/README.md @@ -0,0 +1,37 @@ +# vLLM Performance Dashboard + +Grafana dashboard for monitoring [vLLM](https://github.com/vllm-project/vllm) inference servers running as UbiOps deployments — request throughput, queue depth, KV cache pressure, and token rates. Fed by the `vllm:*` Prometheus metrics that vLLM exposes. + +> **Note:** `dashboard.json` is currently empty (0 bytes) — the export did not save. These docs are reconstructed from `image.png`; re-export the dashboard to capture the panel/query definitions. + +## Variables + +- **Data Source** — Prometheus instance. +- **Namespace** — Kubernetes namespace (e.g. `default`). +- **Deployment** — the vLLM deployment / served model (e.g. 
`gpt-oss-120b`). + +## Rows & panels + +**Request Stats** +- *Requests Running* — requests currently being decoded. +- *Requests Waiting* — requests queued for a slot. +- *KV Cache Usage* — % of the GPU KV cache block pool in use (saturation → queuing). +- *Request Rate* — incoming requests over time. +- *Tokens Generated/sec* — output token throughput. +- *Request States Over Time* — running vs. waiting (and swapped) requests as a timeseries. +- *KV Cache Usage Over Time* — KV cache utilization trend. + +**Per-Minute Metrics (RPM / ITPM / OTPM)** +- *Requests Per Minute (RPM)*. +- *Input Tokens Per Minute (ITPM)* — prompt token volume. +- *Output Tokens Per Minute (OTPM)* — generated token volume. + +## Key things to watch + +- **KV Cache Usage** near 100% with rising **Requests Waiting** — the server is capacity-bound; scale up or shorten contexts. +- **Tokens Generated/sec** / **OTPM** dropping while RPM holds — degraded decode throughput. +- Sustained **Requests Waiting** — queue backlog and latency. + +## Usage + +Default range in the screenshot is the last 2 days with auto-refresh. Import into Grafana, then select datasource, namespace, and deployment. 
\ No newline at end of file diff --git a/grafana/vllm-metrics/dashboard.json b/grafana/vllm-metrics/dashboard.json new file mode 100644 index 0000000..88d3a45 --- /dev/null +++ b/grafana/vllm-metrics/dashboard.json @@ -0,0 +1,2618 @@ +{ + "apiVersion": "dashboard.grafana.app/v2", + "kind": "Dashboard", + "metadata": { + "name": "vllm-perf", + "namespace": "default", + "uid": "AaGqVjAfCOd8D9DSWglyypyK6YhbvBOrEUJn5zrryYIX", + "resourceVersion": "1775139425991994", + "generation": 12, + "creationTimestamp": "2026-02-24T11:40:30Z", + "labels": { + "grafana.app/deprecatedInternalID": "4994" + }, + "annotations": { + "grafana.app/createdBy": "user:ffc8gwlm9q2gwb", + "grafana.app/message": "Restored from version 4", + "grafana.app/saved-from-ui": "Grafana v12.4.2 (ebade4c739)", + "grafana.app/updatedBy": "user:ffc8gwlm9q2gwb", + "grafana.app/updatedTimestamp": "2026-04-02T14:17:05Z", + "grafana.app/folder": "" + } + }, + "spec": { + "annotations": [ + { + "kind": "AnnotationQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "grafana", + "version": "v0", + "datasource": { + "name": "-- Grafana --" + }, + "spec": {} + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "builtIn": true, + "legacyOptions": { + "type": "dashboard" + } + } + } + ], + "cursorSync": "Off", + "editable": true, + "elements": { + "panel-10": { + "kind": "Panel", + "spec": { + "id": 10, + "title": "Time to First Token (TTFT)", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "histogram_quantile(0.50, rate(vllm:time_to_first_token_seconds_bucket{deployments=\"$deployments\"}[5m]))", + "legendFormat": "P50" + } + }, + "refId": "A", + "hidden": false + } + }, + { + "kind": "PanelQuery", + 
"spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "histogram_quantile(0.95, rate(vllm:time_to_first_token_seconds_bucket{deployments=\"$deployments\"}[5m]))", + "legendFormat": "P95" + } + }, + "refId": "B", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "histogram_quantile(0.99, rate(vllm:time_to_first_token_seconds_bucket{deployments=\"$deployments\"}[5m]))", + "legendFormat": "P99" + } + }, + "refId": "C", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "12.4.0", + "spec": { + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + 
} + } + }, + "panel-11": { + "kind": "Panel", + "spec": { + "id": 11, + "title": "Inter-Token Latency (TPOT)", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "histogram_quantile(0.50, rate(vllm:inter_token_latency_seconds_bucket{deployments=\"$deployments\"}[5m]))", + "legendFormat": "P50" + } + }, + "refId": "A", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "histogram_quantile(0.95, rate(vllm:inter_token_latency_seconds_bucket{deployments=\"$deployments\"}[5m]))", + "legendFormat": "P95" + } + }, + "refId": "B", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "histogram_quantile(0.99, rate(vllm:inter_token_latency_seconds_bucket{deployments=\"$deployments\"}[5m]))", + "legendFormat": "P99" + } + }, + "refId": "C", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "12.4.0", + "spec": { + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + 
"axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-12": { + "kind": "Panel", + "spec": { + "id": 12, + "title": "End-to-End Request Latency", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "histogram_quantile(0.50, rate(vllm:e2e_request_latency_seconds_bucket{deployments=\"$deployments\"}[5m]))", + "legendFormat": "P50" + } + }, + "refId": "A", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "histogram_quantile(0.95, rate(vllm:e2e_request_latency_seconds_bucket{deployments=\"$deployments\"}[5m]))", + "legendFormat": "P95" + } + }, + "refId": "B", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "histogram_quantile(0.99, rate(vllm:e2e_request_latency_seconds_bucket{deployments=\"$deployments\"}[5m]))", + "legendFormat": "P99" + } + }, + "refId": "C", + "hidden": false + } + } + ], + 
"transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "12.4.0", + "spec": { + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-13": { + "kind": "Panel", + "spec": { + "id": 13, + "title": "Request Processing Times (P95)", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "histogram_quantile(0.95, rate(vllm:request_queue_time_seconds_bucket{deployments=\"$deployments\"}[5m]))", + "legendFormat": "Queue Time P95" + } + }, + "refId": "A", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + 
"datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "histogram_quantile(0.95, rate(vllm:request_prefill_time_seconds_bucket{deployments=\"$deployments\"}[5m]))", + "legendFormat": "Prefill Time P95" + } + }, + "refId": "B", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "histogram_quantile(0.95, rate(vllm:request_decode_time_seconds_bucket{deployments=\"$deployments\"}[5m]))", + "legendFormat": "Decode Time P95" + } + }, + "refId": "C", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "12.4.0", + "spec": { + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-15": { + "kind": "Panel", + "spec": { + "id": 
15, + "title": "Token Throughput", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "rate(vllm:prompt_tokens_total{deployments=\"$deployments\"}[5m])", + "legendFormat": "Prompt Tokens/sec" + } + }, + "refId": "A", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "rate(vllm:generation_tokens_total{deployments=\"$deployments\"}[5m])", + "legendFormat": "Generation Tokens/sec" + } + }, + "refId": "B", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "12.4.0", + "spec": { + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "fieldConfig": { + "defaults": { + "unit": "ops", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + 
"group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-16": { + "kind": "Panel", + "spec": { + "id": 16, + "title": "Request Token Lengths", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "histogram_quantile(0.50, rate(vllm:request_prompt_tokens_bucket{deployments=\"$deployments\"}[5m]))", + "legendFormat": "Prompt Length P50" + } + }, + "refId": "A", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "histogram_quantile(0.95, rate(vllm:request_prompt_tokens_bucket{deployments=\"$deployments\"}[5m]))", + "legendFormat": "Prompt Length P95" + } + }, + "refId": "B", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "histogram_quantile(0.50, rate(vllm:request_generation_tokens_bucket{deployments=\"$deployments\"}[5m]))", + "legendFormat": "Generation Length P50" + } + }, + "refId": "C", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "histogram_quantile(0.95, rate(vllm:request_generation_tokens_bucket{deployments=\"$deployments\"}[5m]))", + "legendFormat": "Generation Length P95" + } + }, + "refId": "D", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": 
"timeseries", + "version": "12.4.0", + "spec": { + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-18": { + "kind": "Panel", + "spec": { + "id": 18, + "title": "Prefix Cache Activity", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "rate(vllm:prefix_cache_queries_total{deployments=\"$deployments\"}[5m])", + "legendFormat": "Queries" + } + }, + "refId": "A", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "rate(vllm:prefix_cache_hits_total{deployments=\"$deployments\"}[5m])", + "legendFormat": 
"Hits" + } + }, + "refId": "B", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "12.3.3", + "spec": { + "options": { + "legend": { + "calcs": [ + "mean", + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "fieldConfig": { + "defaults": { + "unit": "ops", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-19": { + "kind": "Panel", + "spec": { + "id": 19, + "title": "Prefix Cache Hit Rate", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "rate(vllm:prefix_cache_hits_total{deployments=\"$deployments\"}[5m]) / rate(vllm:prefix_cache_queries_total{deployments=\"$deployments\"}[5m])" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + 
"kind": "VizConfig", + "group": "gauge", + "version": "12.3.3", + "spec": { + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "red" + }, + { + "value": 0.5, + "color": "yellow" + }, + { + "value": 0.8, + "color": "green" + } + ] + }, + "color": { + "mode": "thresholds" + } + }, + "overrides": [] + } + } + } + } + }, + "panel-2": { + "kind": "Panel", + "spec": { + "id": 2, + "title": "Requests Running", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "vllm:num_requests_running{deployments=\"$deployments\"}" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "stat", + "version": "12.4.0", + "spec": { + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 5, + "color": "yellow" + }, + { + "value": 20, + "color": "red" + } + ] + }, + "color": { + "mode": "thresholds" + } + }, + "overrides": [] + } + } + } + } + }, + 
"panel-21": { + "kind": "Panel", + "spec": { + "id": 21, + "title": "Requests Per Minute (RPM)", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "editorMode": "code", + "expr": "sum(rate(vllm:request_success_total{deployments=\"$deployments\"}[1m])) * 60", + "legendFormat": "RPM", + "range": true + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "12.4.0", + "spec": { + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "fieldConfig": { + "defaults": { + "unit": "reqpm", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Requests/min", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-22": { + "kind": "Panel", + "spec": { + "id": 22, + "title": "Input Tokens Per Minute (ITPM)", + 
"description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "rate(vllm:prompt_tokens_total{deployments=\"$deployments\"}[1m]) * 60", + "legendFormat": "ITPM" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "12.4.0", + "spec": { + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Input Tokens/min", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-23": { + "kind": "Panel", + "spec": { + "id": 23, + "title": "Output Tokens Per Minute (OTPM)", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": 
{ + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "rate(vllm:generation_tokens_total{deployments=\"$deployments\"}[1m]) * 60", + "legendFormat": "OTPM" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "12.4.0", + "spec": { + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Output Tokens/min", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-3": { + "kind": "Panel", + "spec": { + "id": 3, + "title": "Requests Waiting", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": 
"vllm:num_requests_waiting{deployments=\"$deployments\"}" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "stat", + "version": "12.4.0", + "spec": { + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + }, + { + "value": 10, + "color": "yellow" + }, + { + "value": 50, + "color": "red" + } + ] + }, + "color": { + "mode": "thresholds" + } + }, + "overrides": [] + } + } + } + } + }, + "panel-4": { + "kind": "Panel", + "spec": { + "id": 4, + "title": "KV Cache Usage", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "vllm:kv_cache_usage_perc{deployments=\"$deployments\"}" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "stat", + "version": "12.4.0", + "spec": { + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": { + "mode": "absolute", + "steps": [ + { + 
"value": 0, + "color": "green" + } + ] + }, + "color": { + "mode": "thresholds" + } + }, + "overrides": [] + } + } + } + } + }, + "panel-5": { + "kind": "Panel", + "spec": { + "id": 5, + "title": "Request Rate", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "editorMode": "code", + "expr": "rate(vllm:request_success_total{deployments=\"$deployments\"}[$__rate_interval])", + "range": true + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "12.4.0", + "spec": { + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "fieldConfig": { + "defaults": { + "unit": "reqps", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-6": { + "kind": "Panel", + "spec": { + "id": 6, + 
"title": "Tokens Generated/sec", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "rate(vllm:generation_tokens_total{deployments=\"$deployments\"}[5m])" + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "stat", + "version": "12.4.0", + "spec": { + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "fieldConfig": { + "defaults": { + "unit": "ops", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + } + ] + }, + "color": { + "mode": "thresholds" + } + }, + "overrides": [] + } + } + } + } + }, + "panel-7": { + "kind": "Panel", + "spec": { + "id": 7, + "title": "Request States Over Time", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "vllm:num_requests_running{deployments=\"$deployments\"}", + "legendFormat": "Running" + } + }, + "refId": "A", + "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "vllm:num_requests_waiting{deployments=\"$deployments\"}", + "legendFormat": "Waiting" + } + }, + "refId": "B", 
+ "hidden": false + } + }, + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "expr": "vllm:num_requests_swapped{deployments=\"$deployments\"}", + "legendFormat": "Swapped" + } + }, + "refId": "C", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": {} + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "12.4.0", + "spec": { + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + }, + "panel-8": { + "kind": "Panel", + "spec": { + "id": 8, + "title": "KV Cache Usage Over Time", + "description": "", + "links": [], + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" 
+ }, + "spec": { + "editorMode": "code", + "expr": "vllm:kv_cache_usage_perc{deployments=\"$deployments\"}", + "legendFormat": "KV Cache Usage", + "range": true + } + }, + "refId": "A", + "hidden": false + } + } + ], + "transformations": [], + "queryOptions": { + "maxDataPoints": 11000, + "interval": "1m" + } + } + }, + "vizConfig": { + "kind": "VizConfig", + "group": "timeseries", + "version": "12.4.0", + "spec": { + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Name", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "value": 0, + "color": "green" + } + ] + }, + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + } + }, + "overrides": [] + } + } + } + } + } + }, + "layout": { + "kind": "RowsLayout", + "spec": { + "rows": [ + { + "kind": "RowsLayoutRow", + "spec": { + "title": "Request Stats", + "collapse": false, + "layout": { + "kind": "GridLayout", + "spec": { + "items": [ + { + "kind": "GridLayoutItem", + "spec": { + "x": 0, + "y": 0, + "width": 4, + "height": 4, + "element": { + "kind": "ElementReference", + "name": "panel-2" + } + } + }, + { + "kind": 
"GridLayoutItem", + "spec": { + "x": 4, + "y": 0, + "width": 4, + "height": 4, + "element": { + "kind": "ElementReference", + "name": "panel-3" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 8, + "y": 0, + "width": 4, + "height": 4, + "element": { + "kind": "ElementReference", + "name": "panel-4" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 12, + "y": 0, + "width": 6, + "height": 4, + "element": { + "kind": "ElementReference", + "name": "panel-5" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 18, + "y": 0, + "width": 6, + "height": 4, + "element": { + "kind": "ElementReference", + "name": "panel-6" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 0, + "y": 4, + "width": 12, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-7" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 12, + "y": 4, + "width": 12, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-8" + } + } + } + ] + } + } + } + }, + { + "kind": "RowsLayoutRow", + "spec": { + "title": "Per-Minute Metrics (RPM/ITPM/OTPM)", + "collapse": false, + "layout": { + "kind": "GridLayout", + "spec": { + "items": [ + { + "kind": "GridLayoutItem", + "spec": { + "x": 0, + "y": 0, + "width": 8, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-21" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 8, + "y": 0, + "width": 8, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-22" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 16, + "y": 0, + "width": 8, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-23" + } + } + } + ] + } + } + } + }, + { + "kind": "RowsLayoutRow", + "spec": { + "title": "Latency Metrics", + "collapse": false, + "layout": { + "kind": "GridLayout", + "spec": { + "items": [ + { + "kind": "GridLayoutItem", + "spec": { + "x": 0, + "y": 0, + "width": 12, + "height": 
8, + "element": { + "kind": "ElementReference", + "name": "panel-10" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 12, + "y": 0, + "width": 12, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-11" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 0, + "y": 8, + "width": 12, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-12" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 12, + "y": 8, + "width": 12, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-13" + } + } + } + ] + } + } + } + }, + { + "kind": "RowsLayoutRow", + "spec": { + "title": "Token Metrics", + "collapse": false, + "layout": { + "kind": "GridLayout", + "spec": { + "items": [ + { + "kind": "GridLayoutItem", + "spec": { + "x": 0, + "y": 0, + "width": 12, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-15" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 12, + "y": 0, + "width": 12, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-16" + } + } + } + ] + } + } + } + }, + { + "kind": "RowsLayoutRow", + "spec": { + "title": "Prefix Cache", + "collapse": false, + "layout": { + "kind": "GridLayout", + "spec": { + "items": [ + { + "kind": "GridLayoutItem", + "spec": { + "x": 0, + "y": 0, + "width": 12, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-18" + } + } + }, + { + "kind": "GridLayoutItem", + "spec": { + "x": 12, + "y": 0, + "width": 12, + "height": 8, + "element": { + "kind": "ElementReference", + "name": "panel-19" + } + } + } + ] + } + } + } + } + ] + } + }, + "links": [], + "liveNow": false, + "preload": false, + "tags": [ + "vllm", + "llm", + "inference" + ], + "timeSettings": { + "timezone": "browser", + "from": "now-1h", + "to": "now", + "autoRefresh": "5s", + "autoRefreshIntervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + 
], + "hideTimepicker": false, + "fiscalYearStartMonth": 0 + }, + "title": "vLLM Performance Dashboard", + "variables": [ + { + "kind": "DatasourceVariable", + "spec": { + "name": "DS_PROMETHEUS", + "pluginId": "prometheus", + "refresh": "onDashboardLoad", + "regex": "", + "current": { + "text": "default", + "value": "default" + }, + "options": [], + "multi": false, + "includeAll": false, + "label": "Data Source", + "hide": "dontHide", + "skipUrlSync": false, + "allowCustomValue": true + } + }, + { + "kind": "QueryVariable", + "spec": { + "name": "deployments", + "current": { + "text": "mistral-medium", + "value": "mistral-medium" + }, + "label": "Deployment", + "hide": "dontHide", + "refresh": "onDashboardLoad", + "skipUrlSync": false, + "query": { + "kind": "DataQuery", + "group": "prometheus", + "version": "v0", + "datasource": { + "name": "${DS_PROMETHEUS}" + }, + "spec": { + "query": "label_values(vllm:num_requests_running, deployments)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + } + }, + "regex": "", + "regexApplyTo": "value", + "sort": "disabled", + "definition": "label_values(vllm:num_requests_running, deployments)", + "options": [], + "multi": false, + "includeAll": false, + "allowCustomValue": true + } + } + ] + } +} \ No newline at end of file