Path: blob/main/operations/observability/mixins/workspace/dashboards/node-psi.json
2501 views
{
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": {
          "type": "grafana",
          "uid": "-- Grafana --"
        },
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "target": {
          "limit": 100,
          "matchAny": false,
          "tags": [],
          "type": "dashboard"
        },
        "type": "dashboard"
      }
    ]
  },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 1,
  "id": 86,
  "links": [],
  "liveNow": false,
  "panels": [
    {
      "collapsed": false,
      "datasource": {
        "type": "datasource",
        "uid": "grafana"
      },
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 0
      },
      "id": 13,
      "panels": [],
      "targets": [
        {
          "datasource": {
            "type": "datasource",
            "uid": "grafana"
          },
          "refId": "A"
        }
      ],
      "title": "Workspace node's normalized Load Average",
      "type": "row"
    },
    {
      "aliasColors": {},
      "bars": false,
      "dashLength": 10,
      "dashes": false,
      "datasource": {
        "type": "prometheus",
        "uid": "$datasource"
      },
      "description": "Nodes with a high normalized load average do not represent a real problem; it only means that pods should probably not be scheduled to them.\n\nIf you'd like to see more details about resource consumption of a particular node, you can do so by clicking on the node name.\n",
      "fill": 1,
      "fillGradient": 5,
      "gridPos": {
        "h": 7,
        "w": 12,
        "x": 0,
        "y": 1
      },
      "hiddenSeries": false,
      "id": 6,
      "legend": {
        "alignAsTable": false,
        "avg": false,
        "current": false,
        "max": false,
        "min": false,
        "rightSide": false,
        "show": true,
        "total": false,
        "values": false
      },
      "lines": true,
      "linewidth": 1,
      "links": [],
      "nullPointMode": "null",
      "options": {
        "alertThreshold": true
      },
      "percentage": false,
      "pluginVersion": "9.5.3",
      "pointradius": 5,
      "points": false,
      "renderer": "flot",
      "repeat": "cluster",
      "seriesOverrides": [
        {
          "$$hashKey": "object:1617",
          "alias": "Node Max Load Avg",
          "color": "#FF0000"
        }
      ],
      "spaceLength": 10,
      "stack": false,
      "steppedLine": false,
      "targets": [
        {
          "datasource": {
            "uid": "$datasource"
          },
          "editorMode": "code",
          "expr": "topk(5, sum(nodepool:node_load1:normalized{cluster=~\"$cluster\", nodepool=~\".*workspace.*\", node=~\"$node\"}) by (node))\n",
          "format": "time_series",
          "intervalFactor": 2,
          "legendFormat": "{{node}}",
          "range": true,
          "refId": "A"
        },
        {
          "datasource": {
            "uid": "$datasource"
          },
          "expr": "1\n",
          "format": "time_series",
          "intervalFactor": 2,
          "legendFormat": "Node Max Load Avg",
          "refId": "B"
        }
      ],
      "thresholds": [],
      "timeRegions": [],
      "title": "$cluster: Workspace node's normalized load average",
      "tooltip": {
        "shared": true,
        "sort": 0,
        "value_type": "individual"
      },
      "type": "graph",
      "xaxis": {
        "mode": "time",
        "show": true,
        "values": []
      },
      "yaxes": [
        {
          "$$hashKey": "object:1630",
          "format": "none",
          "logBase": 1,
          "min": 0,
          "show": true
        },
        {
          "$$hashKey": "object:1631",
          "format": "none",
          "logBase": 1,
          "min": 0,
          "show": true
        }
      ],
      "yaxis": {
        "align": false
      }
    },
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 8
      },
      "id": 86,
      "panels": [],
      "title": "Workspace node's cpu pressure stall information",
      "type": "row"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "max": 1,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 9
      },
      "id": 49,
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "multi",
          "sort": "none"
        }
      },
      "repeat": "cluster",
      "repeatDirection": "h",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "expr": "topk(5, rate(node_pressure_cpu_waiting_seconds_total{cluster=\"$cluster\",node=~\"$node\"}[30s]))",
          "legendFormat": "{{node}}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "$cluster: Workspace node's cpu pressure stall information",
      "type": "timeseries"
    },
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 17
      },
      "id": 165,
      "panels": [],
      "title": "Workspace node's memory pressure stall information",
      "type": "row"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "max": 1,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 18
      },
      "id": 133,
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "multi",
          "sort": "none"
        }
      },
      "repeat": "cluster",
      "repeatDirection": "h",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "expr": "topk(5, rate(node_pressure_memory_waiting_seconds_total{cluster=\"$cluster\",node=~\"$node\"}[30s]))",
          "legendFormat": "{{node}}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "$cluster: Workspace node's memory pressure stall information",
      "type": "timeseries"
    },
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 26
      },
      "id": 224,
      "panels": [],
      "title": "Workspace node's IO pressure stall information",
      "type": "row"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "max": 1,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 27
      },
      "id": 197,
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "multi",
          "sort": "none"
        }
      },
      "repeat": "cluster",
      "repeatDirection": "h",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${datasource}"
          },
          "editorMode": "code",
          "expr": "topk(5, rate(node_pressure_io_waiting_seconds_total{cluster=\"$cluster\",node=~\"$node\"}[30s]))",
          "legendFormat": "{{node}}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "$cluster: Workspace node's IO pressure stall information",
      "type": "timeseries"
    }
  ],
  "refresh": "30s",
  "schemaVersion": 38,
  "style": "dark",
  "tags": [],
  "templating": {
    "list": [
      {
        "current": {
          "selected": false,
          "text": "VictoriaMetrics",
          "value": "VictoriaMetrics"
        },
        "hide": 0,
        "includeAll": false,
        "multi": false,
        "name": "datasource",
        "options": [],
        "query": "prometheus",
        "queryValue": "",
        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "type": "datasource"
      },
      {
        "current": {
          "selected": true,
          "text": [
            "All"
          ],
          "value": [
            "$__all"
          ]
        },
        "datasource": {
          "uid": "$datasource"
        },
        "definition": "",
        "hide": 0,
        "includeAll": true,
        "multi": true,
        "name": "cluster",
        "options": [],
        "query": {
          "query": "label_values(gitpod_ws_manager_mk2_workspace_phase_total, cluster)",
          "refId": "VictoriaMetrics-cluster-Variable-Query"
        },
        "refresh": 2,
        "regex": "",
        "skipUrlSync": false,
        "sort": 1,
        "tagValuesQuery": "",
        "tagsQuery": "",
        "type": "query",
        "useTags": false
      },
      {
        "current": {
          "selected": true,
          "text": [
            "All"
          ],
          "value": [
            "$__all"
          ]
        },
        "datasource": {
          "type": "prometheus",
          "uid": "${datasource}"
        },
        "definition": "label_values(node_pressure_memory_waiting_seconds_total,node)",
        "hide": 0,
        "includeAll": true,
        "multi": true,
        "name": "node",
        "options": [],
        "query": {
          "query": "label_values(node_pressure_memory_waiting_seconds_total,node)",
          "refId": "PrometheusVariableQueryEditor-VariableQuery"
        },
        "refresh": 2,
        "regex": "",
        "skipUrlSync": false,
        "sort": 0,
        "type": "query"
      }
    ]
  },
  "time": {
    "from": "now-1h",
    "to": "now"
  },
  "timepicker": {
    "refresh_intervals": [
      "5s",
      "10s",
      "30s",
      "1m",
      "5m",
      "15m",
      "30m",
      "1h",
      "2h",
      "1d"
    ],
    "time_options": [
      "5m",
      "15m",
      "1h",
      "6h",
      "12h",
      "24h",
      "2d",
      "7d",
      "30d"
    ]
  },
  "timezone": "utc",
  "title": "Node Pressure Stall Information",
  "uid": "T7pAXoVVk",
  "version": 1,
  "weekStart": ""
}