Path: blob/main/operations/observability/mixins/cross-teams/dashboards/gitpod-nodes-overview.json
2500 views
{1"__inputs": [],2"__elements": {},3"__requires": [4{5"type": "grafana",6"id": "grafana",7"name": "Grafana",8"version": "9.1.5"9},10{11"type": "panel",12"id": "graph",13"name": "Graph (old)",14"version": ""15},16{17"type": "panel",18"id": "heatmap",19"name": "Heatmap",20"version": ""21},22{23"type": "datasource",24"id": "prometheus",25"name": "Prometheus",26"version": "1.0.0"27},28{29"type": "panel",30"id": "stat",31"name": "Stat",32"version": ""33},34{35"type": "panel",36"id": "table-old",37"name": "Table (old)",38"version": ""39}40],41"annotations": {42"list": [43{44"builtIn": 1,45"datasource": {46"type": "grafana",47"uid": "-- Grafana --"48},49"enable": true,50"hide": true,51"iconColor": "rgba(0, 211, 255, 1)",52"name": "Annotations & Alerts",53"target": {54"limit": 100,55"matchAny": false,56"tags": [],57"type": "dashboard"58},59"type": "dashboard"60}61]62},63"editable": false,64"fiscalYearStartMonth": 0,65"graphTooltip": 1,66"id": null,67"links": [],68"liveNow": false,69"panels": [70{71"datasource": {72"uid": "$datasource"73},74"description": "Number of nodes in the cluster (nodepool filter applied)",75"fieldConfig": {76"defaults": {77"links": [],78"mappings": [],79"min": 0,80"thresholds": {81"mode": "absolute",82"steps": []83},84"unit": "none"85},86"overrides": []87},88"id": 2,89"links": [],90"options": {91"colorMode": "value",92"graphMode": "area",93"justifyMode": "auto",94"orientation": "auto",95"reduceOptions": {96"calcs": [97"lastNotNull"98],99"fields": "",100"values": false101},102"textMode": "auto"103},104"pluginVersion": "9.1.5",105"repeat": "cluster",106"repeatDirection": "h",107"targets": [108{109"datasource": {110"uid": "$datasource"111},112"expr": "count(kube_node_labels{cluster=\"$cluster\", nodepool=~\"$nodepool\"})",113"format": "time_series",114"intervalFactor": 2,115"legendFormat": "",116"refId": "A"117}118],119"title": "$cluster: # of nodes",120"type": "stat"121},122{123"collapsed": false,124"datasource": {125"type": "datasource",126"uid": 
"grafana"127},128"gridPos": {129"h": 1,130"w": 24,131"x": 0,132"y": 9133},134"id": 12,135"panels": [],136"targets": [137{138"datasource": {139"type": "datasource",140"uid": "grafana"141},142"refId": "A"143}144],145"title": "CPU",146"type": "row"147},148{149"aliasColors": {},150"bars": false,151"dashLength": 10,152"dashes": false,153"datasource": {154"uid": "$datasource"155},156"description": "Average of normalized load average across all nodes of a cluster. If the value is above 1, it means that the cluster is probably saturated.",157"fill": 1,158"fillGradient": 5,159"gridPos": {160"h": 7,161"w": 24,162"x": 0,163"y": 10164},165"hiddenSeries": false,166"id": 3,167"legend": {168"alignAsTable": false,169"avg": false,170"current": false,171"max": false,172"min": false,173"rightSide": false,174"show": true,175"total": false,176"values": false177},178"lines": true,179"linewidth": 1,180"links": [],181"nullPointMode": "null",182"options": {183"alertThreshold": true184},185"percentage": false,186"pluginVersion": "9.1.5",187"pointradius": 5,188"points": false,189"renderer": "flot",190"seriesOverrides": [],191"spaceLength": 10,192"stack": false,193"steppedLine": false,194"targets": [195{196"datasource": {197"uid": "$datasource"198},199"expr": "avg(\n node_load1{cluster=~\"$cluster\",}\n /\n count without (cpu) (\n count without (mode) (\n node_cpu_seconds_total * on(node) group_left() kube_node_labels{nodepool=~\"$nodepool\"}\n )\n )\n) by (cluster)\n",200"format": "time_series",201"intervalFactor": 2,202"legendFormat": "{{cluster}}",203"refId": "A"204}205],206"thresholds": [],207"timeRegions": [],208"title": "Average normalized load average(1min) per cluster",209"tooltip": {210"shared": true,211"sort": 0,212"value_type": "individual"213},214"type": "graph",215"xaxis": {216"mode": "time",217"show": true,218"values": []219},220"yaxes": [221{222"format": "none",223"logBase": 1,224"min": 0,225"show": true226},227{228"format": "none",229"logBase": 1,230"min": 0,231"show": 
true232}233],234"yaxis": {235"align": false236}237},238{239"columns": [],240"datasource": {241"uid": "$datasource"242},243"description": "Top 10 nodes with highest normalized load average. Nodes with a high normalized load average do not represent a real problem, it only means that pods should probably not be scheduled to them.\n\nIf you'd like to see more details about resource consumption of a particular node, you can do so by clicking at the node name.\n",244"fontSize": "100%",245"gridPos": {246"h": 7,247"w": 12,248"x": 0,249"y": 17250},251"id": 4,252"links": [],253"showHeader": true,254"sort": {255"col": 0,256"desc": true257},258"styles": [259{260"align": "auto",261"link": true,262"linkTargetBlank": true,263"linkUrl": "d/gitpod-admin-nodes/gitpod-admin-nodes?var-datasource=$datasource&var-cluster=$__cell_1&var-nodepool=$__cell_3&var-node=$__cell",264"pattern": "node"265},266{267"align": "auto",268"pattern": "nodepool",269"type": "hidden"270},271{272"align": "auto",273"pattern": "Time",274"type": "hidden"275},276{277"alias": "Normalized load average",278"align": "auto",279"decimals": 1,280"pattern": "Value",281"type": "number",282"unit": "none"283}284],285"targets": [286{287"datasource": {288"uid": "$datasource"289},290"expr": "sort(\n topk(10,\n sum(\n node_load1{cluster=~\"$cluster\"} * on(node) group_left(nodepool) kube_node_labels{nodepool=~\"$nodepool\"}\n /\n count without (cpu) (\n count without (mode) (\n node_cpu_seconds_total * on(node) group_left(nodepool) kube_node_labels{nodepool=~\"$nodepool\"}\n )\n )\n ) by (node, nodepool, cluster)\n )\n)\n",291"format": "table",292"instant": true,293"intervalFactor": 2,294"legendFormat": "",295"refId": "A"296}297],298"title": "Current nodes with highest normalized load average (1min)",299"transform": "table",300"type": "table-old"301},302{303"cards": {},304"color": {305"cardColor": "#b4ff00",306"colorScale": "sqrt",307"colorScheme": "interpolateGreens",308"exponent": 0.5,309"mode": 
"spectrum"310},311"dataFormat": "timeseries",312"datasource": {313"uid": "$datasource"314},315"description": "Normalized load average per node, distributed in buckets. If the distribution is above 1, it means that our cluster is probably overbooked.",316"fieldConfig": {317"defaults": {318"custom": {319"hideFrom": {320"legend": false,321"tooltip": false,322"viz": false323},324"scaleDistribution": {325"type": "linear"326}327}328},329"overrides": []330},331"gridPos": {332"h": 7,333"w": 12,334"x": 12,335"y": 17336},337"heatmap": {},338"hideZeroBuckets": true,339"highlightCards": true,340"id": 5,341"legend": {342"show": false343},344"options": {345"calculate": true,346"calculation": {347"yBuckets": {348"mode": "size",349"value": "1"350}351},352"cellGap": 2,353"cellValues": {},354"color": {355"exponent": 0.5,356"fill": "#b4ff00",357"mode": "scheme",358"reverse": false,359"scale": "exponential",360"scheme": "Greens",361"steps": 128362},363"exemplars": {364"color": "rgba(255,0,255,0.7)"365},366"filterValues": {367"le": 1e-9368},369"legend": {370"show": false371},372"rowsFrame": {373"layout": "auto"374},375"showValue": "never",376"tooltip": {377"show": true,378"yHistogram": false379},380"yAxis": {381"axisPlacement": "left",382"min": 0,383"reverse": false,384"unit": "none"385}386},387"pluginVersion": "9.1.5",388"targets": [389{390"datasource": {391"uid": "$datasource"392},393"expr": "sum(\n node_load1{cluster=~\"$cluster\",}\n /\n count without (cpu) (\n count without (mode) (\n node_cpu_seconds_total * on(node) group_left() kube_node_labels{nodepool=~\"$nodepool\"}\n )\n )\n) by (node, cluster)\n",394"format": "table",395"intervalFactor": 2,396"legendFormat": "",397"refId": "A"398}399],400"title": "Normalized Load average(1 min)",401"tooltip": {402"show": true,403"showHistogram": false404},405"type": "heatmap",406"xAxis": {407"show": true408},409"yAxis": {410"format": "none",411"logBase": 1,412"min": 0,413"show": true414},415"yBucketBound": "auto",416"yBucketSize": 
1417},418{419"collapsed": false,420"datasource": {421"type": "datasource",422"uid": "grafana"423},424"gridPos": {425"h": 1,426"w": 24,427"x": 0,428"y": 24429},430"id": 13,431"panels": [],432"targets": [433{434"datasource": {435"type": "datasource",436"uid": "grafana"437},438"refId": "A"439}440],441"title": "Memory",442"type": "row"443},444{445"aliasColors": {},446"bars": false,447"dashLength": 10,448"dashes": false,449"datasource": {450"uid": "$datasource"451},452"description": "Average of free memory across all nodes of a cluster.",453"fill": 1,454"fillGradient": 5,455"gridPos": {456"h": 7,457"w": 24,458"x": 0,459"y": 25460},461"id": 6,462"legend": {463"alignAsTable": false,464"avg": false,465"current": false,466"max": false,467"min": false,468"rightSide": false,469"show": true,470"total": false,471"values": false472},473"lines": true,474"linewidth": 1,475"links": [],476"nullPointMode": "null",477"percentage": false,478"pointradius": 5,479"points": false,480"renderer": "flot",481"seriesOverrides": [],482"spaceLength": 10,483"stack": false,484"steppedLine": false,485"targets": [486{487"datasource": {488"uid": "$datasource"489},490"expr": "avg(\n node_memory_MemAvailable_bytes{cluster=~\"$cluster\"}\n *\n on(node, cluster) kube_node_labels{nodepool=~\"$nodepool\"}\n) by (cluster)\n",491"format": "time_series",492"intervalFactor": 2,493"legendFormat": "{{cluster}}",494"refId": "A"495}496],497"thresholds": [],498"title": "Average free memory per cluster",499"tooltip": {500"shared": true,501"sort": 0,502"value_type": "individual"503},504"type": "graph",505"xaxis": {506"mode": "time",507"show": true,508"values": []509},510"yaxes": [511{512"format": "bytes",513"logBase": 1,514"min": 0,515"show": true516},517{518"format": "bytes",519"logBase": 1,520"min": 0,521"show": true522}523]524},525{526"columns": [],527"datasource": {528"uid": "$datasource"529},530"description": "Top 10 nodes with least amount of free memory. 
Ideally, we should never have a node with free memory equal to 0; at that point pods will start to get OOM killed.\n\nIf you'd like to see more details about resource consumption of a particular node, you can do so by clicking at the node name.\n",531"gridPos": {532"h": 7,533"w": 12,534"x": 0,535"y": 32536},537"id": 7,538"links": [],539"styles": [540{541"align": "auto",542"link": true,543"linkTargetBlank": true,544"linkUrl": "d/gitpod-admin-nodes/gitpod-admin-nodes?var-datasource=$datasource&var-cluster=$__cell_1&var-nodepool=$__cell_3&var-node=$__cell",545"pattern": "node"546},547{548"align": "auto",549"pattern": "nodepool",550"type": "hidden"551},552{553"align": "auto",554"pattern": "Time",555"type": "hidden"556},557{558"alias": "Free Memory",559"align": "auto",560"decimals": 1,561"pattern": "Value",562"type": "number",563"unit": "bytes"564}565],566"targets": [567{568"datasource": {569"uid": "$datasource"570},571"expr": "sort_desc(\n bottomk(10,\n sum(\n node_memory_MemAvailable_bytes{cluster=~\"$cluster\"}\n *\n on(node, cluster) group_left(nodepool) kube_node_labels{nodepool=~\"$nodepool\"}\n ) by (node, nodepool, cluster)\n )\n)\n",572"format": "table",573"instant": true,574"intervalFactor": 2,575"legendFormat": "",576"refId": "A"577}578],579"title": "Current nodes with least available memory",580"type": "table-old"581},582{583"cards": {},584"color": {585"cardColor": "#b4ff00",586"colorScale": "sqrt",587"colorScheme": "interpolateGreens",588"exponent": 0.5,589"mode": "spectrum"590},591"dataFormat": "timeseries",592"datasource": {593"uid": "$datasource"594},595"description": "Free memory per node, distributed in buckets. 
Workspaces running on nodes in the lowest buckets are good candidates to get OOMed.",596"gridPos": {597"h": 7,598"w": 12,599"x": 12,600"y": 32601},602"heatmap": {},603"hideZeroBuckets": true,604"highlightCards": true,605"id": 8,606"legend": {607"show": false608},609"targets": [610{611"datasource": {612"uid": "$datasource"613},614"expr": "node_memory_MemAvailable_bytes{cluster=~\"$cluster\"}\n*\non(node, cluster) kube_node_labels{nodepool=~\"$nodepool\"}\n",615"format": "table",616"intervalFactor": 2,617"legendFormat": "",618"refId": "A"619}620],621"title": "Free Memory",622"tooltip": {623"show": true,624"showHistogram": false625},626"type": "heatmap",627"xAxis": {628"show": true629},630"yAxis": {631"format": "bytes",632"logBase": 1,633"min": 0,634"show": true635},636"yBucketBound": "auto"637},638{639"collapsed": false,640"datasource": {641"type": "datasource",642"uid": "grafana"643},644"gridPos": {645"h": 1,646"w": 24,647"x": 0,648"y": 39649},650"id": 14,651"panels": [],652"targets": [653{654"datasource": {655"type": "datasource",656"uid": "grafana"657},658"refId": "A"659}660],661"title": "Disk",662"type": "row"663},664{665"aliasColors": {},666"bars": false,667"dashLength": 10,668"dashes": false,669"datasource": {670"uid": "$datasource"671},672"description": "Average of free disk space in the /dev/sdb across all nodes of a cluster.",673"fill": 1,674"fillGradient": 5,675"gridPos": {676"h": 7,677"w": 24,678"x": 0,679"y": 40680},681"id": 9,682"legend": {683"alignAsTable": false,684"avg": false,685"current": false,686"max": false,687"min": false,688"rightSide": false,689"show": true,690"total": false,691"values": false692},693"lines": true,694"linewidth": 1,695"links": [],696"nullPointMode": "null",697"percentage": false,698"pointradius": 5,699"points": false,700"renderer": "flot",701"seriesOverrides": [],702"spaceLength": 10,703"stack": false,704"steppedLine": false,705"targets": [706{707"datasource": {708"uid": "$datasource"709},710"expr": "avg(\n 
node_filesystem_avail_bytes{cluster=~\"$cluster\", fstype!=\"shiftfs\", device=\"/dev/sdb\"}\n *\n on(node, cluster) kube_node_labels{nodepool=~\"$nodepool\"}\n) by (device, cluster)\n",711"format": "time_series",712"intervalFactor": 2,713"legendFormat": "{{cluster}}",714"refId": "A"715}716],717"thresholds": [],718"title": "Average free disk per cluster (/dev/sdb)",719"tooltip": {720"shared": true,721"sort": 0,722"value_type": "individual"723},724"type": "graph",725"xaxis": {726"mode": "time",727"show": true,728"values": []729},730"yaxes": [731{732"format": "bytes",733"logBase": 1,734"min": 0,735"show": true736},737{738"format": "bytes",739"logBase": 1,740"min": 0,741"show": true742}743]744},745{746"columns": [],747"datasource": {748"uid": "$datasource"749},750"description": "Top 10 nodes with least amount of free space on the /dev/sdb device. If any node has less than 20GB, it's time to clean it up. (Don't worry, we have an alert for it.)\n\nIf you'd like to see more details about resource consumption of a particular node, you can do so by clicking at the node name.\n",751"gridPos": {752"h": 7,753"w": 12,754"x": 0,755"y": 47756},757"id": 10,758"links": [],759"styles": [760{761"align": "auto",762"link": true,763"linkTargetBlank": true,764"linkUrl": "d/gitpod-admin-nodes/gitpod-admin-nodes?var-datasource=$datasource&var-cluster=$__cell_1&var-nodepool=$__cell_3&var-node=$__cell",765"pattern": "node"766},767{768"align": "auto",769"pattern": "nodepool",770"type": "hidden"771},772{773"align": "auto",774"pattern": "Time",775"type": "hidden"776},777{778"alias": "Free disk",779"align": "auto",780"decimals": 1,781"pattern": "Value",782"type": "number",783"unit": "bytes"784}785],786"targets": [787{788"datasource": {789"uid": "$datasource"790},791"expr": "sort_desc(\n bottomk(10,\n sum(\n node_filesystem_avail_bytes{cluster=~\"$cluster\", fstype!=\"shiftfs\", device=\"/dev/sdb\"}\n *\n on(node, cluster) group_left(nodepool) kube_node_labels{nodepool=~\"$nodepool\"}\n 
) by (node, nodepool, cluster)\n )\n)\n",792"format": "table",793"instant": true,794"intervalFactor": 2,795"legendFormat": "",796"refId": "A"797}798],799"title": "Current nodes with least available disk",800"type": "table-old"801},802{803"cards": {},804"color": {805"cardColor": "#b4ff00",806"colorScale": "sqrt",807"colorScheme": "interpolateGreens",808"exponent": 0.5,809"mode": "spectrum"810},811"dataFormat": "timeseries",812"datasource": {813"uid": "$datasource"814},815"description": "Free disk space on /dev/sdb per node, distributed in buckets. Workspaces running on nodes in the lowest buckets will probably have issues.",816"gridPos": {817"h": 7,818"w": 12,819"x": 12,820"y": 47821},822"heatmap": {},823"hideZeroBuckets": true,824"highlightCards": true,825"id": 11,826"legend": {827"show": false828},829"targets": [830{831"datasource": {832"uid": "$datasource"833},834"expr": "node_filesystem_avail_bytes{cluster=~\"$cluster\", fstype!=\"shiftfs\", device=\"/dev/sdb\"}\n*\non(node, cluster) kube_node_labels{nodepool=~\"$nodepool\"}\n",835"format": "table",836"intervalFactor": 2,837"legendFormat": "",838"refId": "A"839}840],841"title": "Free disk space",842"tooltip": {843"show": true,844"showHistogram": false845},846"type": "heatmap",847"xAxis": {848"show": true849},850"yAxis": {851"format": "bytes",852"logBase": 1,853"min": 0,854"show": true855},856"yBucketBound": "auto"857}858],859"refresh": "30s",860"schemaVersion": 37,861"style": "dark",862"tags": [863"gitpod-mixin"864],865"templating": {866"list": [867{868"current": {869"selected": false,870"text": "VictoriaMetrics",871"value": "VictoriaMetrics"872},873"hide": 0,874"includeAll": false,875"multi": false,876"name": "datasource",877"options": [],878"query": "prometheus",879"refresh": 1,880"regex": "",881"skipUrlSync": false,882"type": "datasource"883},884{885"current": {},886"datasource": {887"uid": "$datasource"888},889"definition": "",890"hide": 0,891"includeAll": true,892"multi": true,893"name": 
"cluster",894"options": [],895"query": {896"query": "label_values(up{job=\"node-exporter\"}, cluster)",897"refId": "VictoriaMetrics-cluster-Variable-Query"898},899"refresh": 2,900"regex": "",901"skipUrlSync": false,902"sort": 1,903"tagValuesQuery": "",904"tagsQuery": "",905"type": "query",906"useTags": false907},908{909"current": {},910"datasource": {911"uid": "$datasource"912},913"definition": "",914"hide": 0,915"includeAll": true,916"multi": true,917"name": "nodepool",918"options": [],919"query": {920"query": "label_values(kube_node_labels{cluster=~\"$cluster\"}, nodepool)",921"refId": "VictoriaMetrics-nodepool-Variable-Query"922},923"refresh": 2,924"regex": "",925"skipUrlSync": false,926"sort": 1,927"tagValuesQuery": "",928"tagsQuery": "",929"type": "query",930"useTags": false931}932]933},934"time": {935"from": "now-1h",936"to": "now"937},938"timepicker": {939"refresh_intervals": [940"5s",941"10s",942"30s",943"1m",944"5m",945"15m",946"30m",947"1h",948"2h",949"1d"950],951"time_options": [952"5m",953"15m",954"1h",955"6h",956"12h",957"24h",958"2d",959"7d",960"30d"961]962},963"timezone": "utc",964"title": "Gitpod / Nodes Overview",965"uid": "gitpod-nodes-overview",966"version": 1,967"weekStart": ""968}969970971