aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author artem-trof <artem-trof@yandex-team.com> 2023-02-08 22:59:57 +0300
committer artem-trof <artem-trof@yandex-team.com> 2023-02-08 22:59:57 +0300
commit 10cb6c3a57347bb0119750e987fba84c0d425eba (patch)
tree 1f79b060bacc8c346d43fb6d87911068e9809a7b
parent 108a09f8e8729a86f9ebe0dd891e5dfc36c66eb6 (diff)
download ydb-10cb6c3a57347bb0119750e987fba84c0d425eba.tar.gz
PR from branch users/artem-trof/ydbops-6676-setup-monitoring
ydbops-6676: add separate helmfile for grafana dashboards
ydbops-6676: move chart contents to right folder
ydbops-6676: rename chart
ydbops-6676: add grafana-dashboards helm chart
ydbops-6676: fix metric relabeling & chart values
ydbops-6676: rename release
ydbops-6676: disable node selector
ydbops-6676: move to ydb-testing
ydbops-6676: fix monitoring port & counters
ydbops-6676: add helmfile & values for monitoring
-rw-r--r--ydb/deploy/grafana_dashboards/README.md3
-rwxr-xr-xydb/deploy/grafana_dashboards/local_upload_dashboards.sh3
-rw-r--r--ydb/deploy/helm/ydb-prometheus/.helmignore23
-rw-r--r--ydb/deploy/helm/ydb-prometheus/Chart.lock6
-rw-r--r--ydb/deploy/helm/ydb-prometheus/Chart.yaml23
-rw-r--r--ydb/deploy/helm/ydb-prometheus/README.md49
-rw-r--r--ydb/deploy/helm/ydb-prometheus/dashboards/actors.json (renamed from ydb/deploy/grafana_dashboards/actors.json)0
-rw-r--r--ydb/deploy/helm/ydb-prometheus/dashboards/cpu.json (renamed from ydb/deploy/grafana_dashboards/cpu.json)0
-rw-r--r--ydb/deploy/helm/ydb-prometheus/dashboards/datashard.json (renamed from ydb/deploy/grafana_dashboards/datashard.json)0
-rw-r--r--ydb/deploy/helm/ydb-prometheus/dashboards/dboverview.json (renamed from ydb/deploy/grafana_dashboards/dboverview.json)0
-rw-r--r--ydb/deploy/helm/ydb-prometheus/dashboards/dbstatus.json (renamed from ydb/deploy/grafana_dashboards/dbstatus.json)0
-rw-r--r--ydb/deploy/helm/ydb-prometheus/dashboards/grpc.json (renamed from ydb/deploy/grafana_dashboards/grpc.json)0
-rw-r--r--ydb/deploy/helm/ydb-prometheus/dashboards/queryengine.json (renamed from ydb/deploy/grafana_dashboards/queryengine.json)0
-rw-r--r--ydb/deploy/helm/ydb-prometheus/dashboards/txproxy.json (renamed from ydb/deploy/grafana_dashboards/txproxy.json)0
-rw-r--r--ydb/deploy/helm/ydb-prometheus/templates/_helpers.tpl119
-rw-r--r--ydb/deploy/helm/ydb-prometheus/templates/configmaps-grafana.yaml20
-rw-r--r--ydb/deploy/helm/ydb-prometheus/templates/secret-prometheus-static-targets.yaml19
-rw-r--r--ydb/deploy/helm/ydb-prometheus/templates/service-monitor-cluster.yaml2
-rw-r--r--ydb/deploy/helm/ydb-prometheus/values.yaml97
19 files changed, 362 insertions, 2 deletions
diff --git a/ydb/deploy/grafana_dashboards/README.md b/ydb/deploy/grafana_dashboards/README.md
new file mode 100644
index 0000000000..03361dcb68
--- /dev/null
+++ b/ydb/deploy/grafana_dashboards/README.md
@@ -0,0 +1,3 @@
+# Grafana dashboards
+
+Grafana dashboards have moved [here](../helm/ydb-prometheus/dashboards/).
diff --git a/ydb/deploy/grafana_dashboards/local_upload_dashboards.sh b/ydb/deploy/grafana_dashboards/local_upload_dashboards.sh
index 5b052e2ab2..a904250810 100755
--- a/ydb/deploy/grafana_dashboards/local_upload_dashboards.sh
+++ b/ydb/deploy/grafana_dashboards/local_upload_dashboards.sh
@@ -5,6 +5,5 @@ GRAFANA_API="http://admin:admin@localhost:3000/api"
curl -X POST -H "Content-Type: application/json" ${GRAFANA_API}/folders --data-ascii '{ "uid": "ydb", "title": "YDB" }'
for DASH in cpu dboverview dbstatus actors grpc queryengine txproxy datashard; do
- cat ${DASH}.json | jq '{ folderUid: "ydb", dashboard: . }' | curl -X POST -H "Content-Type: application/json" ${GRAFANA_API}/dashboards/db -d @-
+ cat ../helm/ydb-prometheus/dashboards/${DASH}.json | jq '{ folderUid: "ydb", dashboard: . }' | curl -X POST -H "Content-Type: application/json" ${GRAFANA_API}/dashboards/db -d @-
done
-
diff --git a/ydb/deploy/helm/ydb-prometheus/.helmignore b/ydb/deploy/helm/ydb-prometheus/.helmignore
new file mode 100644
index 0000000000..0e8a0eb36f
--- /dev/null
+++ b/ydb/deploy/helm/ydb-prometheus/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/ydb/deploy/helm/ydb-prometheus/Chart.lock b/ydb/deploy/helm/ydb-prometheus/Chart.lock
new file mode 100644
index 0000000000..580754a450
--- /dev/null
+++ b/ydb/deploy/helm/ydb-prometheus/Chart.lock
@@ -0,0 +1,6 @@
+dependencies:
+- name: kube-prometheus-stack
+ repository: https://prometheus-community.github.io/helm-charts
+ version: 44.3.1
+digest: sha256:a47a360d87953897ca25b27a7eb67767773d324b4e5c58f3cd7af4aa9da7a375
+generated: "2023-02-08T12:54:45.749905+03:00"
diff --git a/ydb/deploy/helm/ydb-prometheus/Chart.yaml b/ydb/deploy/helm/ydb-prometheus/Chart.yaml
new file mode 100644
index 0000000000..82e93e473e
--- /dev/null
+++ b/ydb/deploy/helm/ydb-prometheus/Chart.yaml
@@ -0,0 +1,23 @@
+apiVersion: v2
+description: Chart with resources for monitoring YDB clusters with Prometheus and Grafana.
+icon: https://raw.githubusercontent.com/ydb-platform/ydb/main/ydb/docs/_assets/logo.svg
+type: application
+maintainers:
+ - name: YDB
+ email: info@ydb.tech
+name: ydb-prometheus
+sources:
+- https://github.com/ydb-platform/ydb
+version: 0.1.0
+appVersion: "0.1.0"
+keywords:
+ - "prometheus"
+ - "kube-prometheus"
+ - "monitoring"
+ - "ydb"
+
+dependencies:
+ - name: kube-prometheus-stack
+ version: 44.3.*
+ repository: https://prometheus-community.github.io/helm-charts
+ condition: kube-prometheus-stack.enabled \ No newline at end of file
diff --git a/ydb/deploy/helm/ydb-prometheus/README.md b/ydb/deploy/helm/ydb-prometheus/README.md
new file mode 100644
index 0000000000..25eccdf80f
--- /dev/null
+++ b/ydb/deploy/helm/ydb-prometheus/README.md
@@ -0,0 +1,49 @@
+# YDB Monitoring Prometheus
+
+Chart with resources for monitoring YDB clusters with Prometheus and Grafana.
+
+This chart depends on [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/8a4f9ea1fb0fe32f3169cbfbd9f0fc517f4aaa10/charts/kube-prometheus-stack).
+
+This chart installs the following resources:
+
+* Prometheus [additional scrape configs](https://github.com/prometheus-community/helm-charts/blob/8a4f9ea1fb0fe32f3169cbfbd9f0fc517f4aaa10/charts/kube-prometheus-stack/values.yaml#L3031) for external clusters
+* Prometheus ServiceMonitor object for internal clusters
+* ConfigMaps with Grafana dashboards
+
+## Monitoring External YDB Cluster (bare metal or virtual machines)
+
+1. Set the following in values.yaml:
+
+```yaml
+kube-prometheus-stack:
+ prometheus:
+ prometheusSpec:
+ additionalScrapeConfigsSecret:
+ enabled: true
+ name: ydb-prometheus-additional-scrape-configs
+ key: additional-scrape-configs.yaml
+```
+
+2. The secret (ydb-prometheus-additional-scrape-configs) will be generated during chart installation and referenced in the Prometheus custom resource.
+
+3. Set the following in values.yaml to describe the cluster to monitor:
+
+```yaml
+
+ydb:
+ clusters:
+ - cluster: <cluster-name>
+ type: external
+ ports:
+ static: <static nodes port>
+ dynamic:
+ - <dynamic nodes ports, one per database (tenant)>
+ hosts:
+ - <ydb host>
+```
+
+4. Install the chart with `helm`.
+
+## Monitoring Internal YDB Cluster (deployed with ydb-operator)
+
+Work in progress
diff --git a/ydb/deploy/grafana_dashboards/actors.json b/ydb/deploy/helm/ydb-prometheus/dashboards/actors.json
index e563e575c9..e563e575c9 100644
--- a/ydb/deploy/grafana_dashboards/actors.json
+++ b/ydb/deploy/helm/ydb-prometheus/dashboards/actors.json
diff --git a/ydb/deploy/grafana_dashboards/cpu.json b/ydb/deploy/helm/ydb-prometheus/dashboards/cpu.json
index 8848c45014..8848c45014 100644
--- a/ydb/deploy/grafana_dashboards/cpu.json
+++ b/ydb/deploy/helm/ydb-prometheus/dashboards/cpu.json
diff --git a/ydb/deploy/grafana_dashboards/datashard.json b/ydb/deploy/helm/ydb-prometheus/dashboards/datashard.json
index 65b9711bc2..65b9711bc2 100644
--- a/ydb/deploy/grafana_dashboards/datashard.json
+++ b/ydb/deploy/helm/ydb-prometheus/dashboards/datashard.json
diff --git a/ydb/deploy/grafana_dashboards/dboverview.json b/ydb/deploy/helm/ydb-prometheus/dashboards/dboverview.json
index 263942394c..263942394c 100644
--- a/ydb/deploy/grafana_dashboards/dboverview.json
+++ b/ydb/deploy/helm/ydb-prometheus/dashboards/dboverview.json
diff --git a/ydb/deploy/grafana_dashboards/dbstatus.json b/ydb/deploy/helm/ydb-prometheus/dashboards/dbstatus.json
index ea420fa151..ea420fa151 100644
--- a/ydb/deploy/grafana_dashboards/dbstatus.json
+++ b/ydb/deploy/helm/ydb-prometheus/dashboards/dbstatus.json
diff --git a/ydb/deploy/grafana_dashboards/grpc.json b/ydb/deploy/helm/ydb-prometheus/dashboards/grpc.json
index 0c9dec4434..0c9dec4434 100644
--- a/ydb/deploy/grafana_dashboards/grpc.json
+++ b/ydb/deploy/helm/ydb-prometheus/dashboards/grpc.json
diff --git a/ydb/deploy/grafana_dashboards/queryengine.json b/ydb/deploy/helm/ydb-prometheus/dashboards/queryengine.json
index ae862b7a04..ae862b7a04 100644
--- a/ydb/deploy/grafana_dashboards/queryengine.json
+++ b/ydb/deploy/helm/ydb-prometheus/dashboards/queryengine.json
diff --git a/ydb/deploy/grafana_dashboards/txproxy.json b/ydb/deploy/helm/ydb-prometheus/dashboards/txproxy.json
index 51d4666a61..51d4666a61 100644
--- a/ydb/deploy/grafana_dashboards/txproxy.json
+++ b/ydb/deploy/helm/ydb-prometheus/dashboards/txproxy.json
diff --git a/ydb/deploy/helm/ydb-prometheus/templates/_helpers.tpl b/ydb/deploy/helm/ydb-prometheus/templates/_helpers.tpl
new file mode 100644
index 0000000000..e2b4a0424c
--- /dev/null
+++ b/ydb/deploy/helm/ydb-prometheus/templates/_helpers.tpl
@@ -0,0 +1,119 @@
+{{/*
+Expand the name of the chart.
+Uses .Values.nameOverride when it is set and .Chart.Name otherwise; the result
+is truncated to 63 characters and a trailing "-" is trimmed.
+*/}}
+{{- define "ydb-prometheus.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Create a default fully qualified app name.
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+Resolution order:
+  1. .Values.fullnameOverride, when set;
+  2. the release name alone, when it already contains the chart name
+     (or .Values.nameOverride, when set);
+  3. "<release name>-<chart name or nameOverride>" otherwise.
+In every case a trailing "-" left after truncation is trimmed.
+*/}}
+{{- define "ydb-prometheus.fullname" -}}
+{{- if .Values.fullnameOverride }}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- $name := default .Chart.Name .Values.nameOverride }}
+{{- if contains $name .Release.Name }}
+{{- .Release.Name | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
+{{- end }}
+{{- end }}
+{{- end }}
+
+{{/*
+Create chart name and version as used by the chart label.
+Renders "<chart name>-<chart version>". Any "+" (legal in SemVer build
+metadata but not in Kubernetes label values) is replaced with "_"; the result
+is truncated to 63 characters and a trailing "-" is trimmed.
+*/}}
+{{- define "ydb-prometheus.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Common labels
+Rendered as un-indented "key: value" lines; callers are expected to indent the
+output themselves (the templates in this chart pipe it through nindent).
+app.kubernetes.io/version is emitted only when .Chart.AppVersion is set.
+*/}}
+{{- define "ydb-prometheus.labels" -}}
+helm.sh/chart: {{ include "ydb-prometheus.chart" . }}
+app.kubernetes.io/name: {{ include "ydb-prometheus.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end -}}
+
+{{/*
+Grafana dashboard marker label.
+Renders a single "<key>: <value>" line taken from
+.Values.grafanaDashboards.markerLabel; the value is quoted so that it is always
+emitted as a string.
+*/}}
+{{- define "ydb-prometheus.labels.grafana" -}}
+{{- $context := . -}}
+{{ $context.Values.grafanaDashboards.markerLabel.key }}: {{ $context.Values.grafanaDashboards.markerLabel.value | quote }}
+{{- end -}}
+
+{{/*
+Prometheus scrape configs for external YDB clusters.
+Walks .Values.ydb.clusters and, for every cluster whose type is "external",
+builds one scrape job per (port, counter) pair:
+  * on the static port, for counters whose type is "static" or "all";
+  * on each dynamic port, for counters whose type is "dynamic" or "all".
+A counter without an explicit type is treated as "all".
+Each job is rendered by "ydb-prometheus.externalCluster.targetCounter", parsed
+back with fromYaml and appended to a list; the list is finally rendered with
+toYaml.
+NOTE(review): with no matching clusters the list is empty and toYaml presumably
+renders "[]", which is a non-empty string -- confirm that callers testing the
+output for emptiness behave as intended.
+*/}}
+{{- define "ydb-prometheus.externalCluster.config" -}}
+{{- $context := . -}}
+{{- $counters := .Values.ydb.counters -}}
+{{- $clusters := .Values.ydb.clusters -}}
+{{- $result := list -}}
+{{- range $cluster := $clusters }}
+ {{- if eq $cluster.type "external" }}
+
+ {{- range $counter := $counters }}
+ {{- $name := "staticNode" }}
+ {{- $port := $cluster.ports.static }}
+ {{- $type := default "all" $counter.type }}
+ {{- if or (eq $type "static") (eq $type "all") }}
+ {{- $config := include "ydb-prometheus.externalCluster.targetCounter" (tuple $cluster $counter $name $port) | fromYaml }}
+ {{- $result = append $result $config }}
+ {{- end }}
+ {{- end }}
+
+ {{- range $port := $cluster.ports.dynamic }}
+ {{- range $counter := $counters }}
+ {{- $name := printf "dynamicNode/%d" ($port | int) }}
+ {{- $type := default "all" $counter.type }}
+ {{- if or (eq $type "dynamic") (eq $type "all") }}
+ {{- $config := include "ydb-prometheus.externalCluster.targetCounter" (tuple $cluster $counter $name $port) | fromYaml }}
+ {{- $result = append $result $config }}
+ {{- end }}
+ {{- end }}
+ {{- end }}
+
+{{- end }}
+{{- end }}
+
+{{- $result | toYaml }}
+{{- end -}}
+
+{{/*
+Render one Prometheus scrape job for a single (cluster, counter, port) combination.
+Expects a tuple as context:
+  0: cluster - entry of .Values.ydb.clusters (reads .cluster and .hosts)
+  1: counter - entry of .Values.ydb.counters (reads .counter and the optional .metricsPath)
+  2: name    - job name fragment identifying the node kind
+  3: port    - monitoring port appended to every host
+The metrics path defaults to "/counters/counters=<counter>/prometheus" unless
+the counter overrides it with metricsPath.
+relabel_configs rewrites "instance" to the target address without its port;
+metric_relabel_configs prefixes every metric name with "<counter>_".
+All templated scalars are quoted so that values which look like numbers or
+booleans (for example a cluster named "2023" or "on") stay strings through the
+fromYaml/toYaml round trip done by the caller.
+NOTE(review): the "container" label is always "ydb-dynamic", including for jobs
+that target static nodes -- presumably the dashboards filter on it; confirm.
+*/}}
+{{- define "ydb-prometheus.externalCluster.targetCounter" }}
+{{- $context := . }}
+{{- $cluster := index $context 0 }}
+{{- $counter := index $context 1 }}
+{{- $name := index $context 2 }}
+{{- $port := index $context 3 }}
+{{- $metricsPath := (printf "/counters/counters=%s/prometheus" $counter.counter) }}
+{{- if $counter.metricsPath }}
+{{- $metricsPath = $counter.metricsPath }}
+{{- end }}
+job_name: {{ printf "ydb/%s/%s/counter/%s" $cluster.cluster $name $counter.counter | quote }}
+metrics_path: {{ $metricsPath | quote }}
+relabel_configs:
+- source_labels:
+  - __address__
+  target_label: instance
+  regex: '([^:]+)(:[0-9]+)?'
+  replacement: '${1}'
+metric_relabel_configs:
+- source_labels:
+  - __name__
+  target_label: __name__
+  regex: (.*)
+  replacement: "{{ $counter.counter }}_$1"
+static_configs:
+- targets:
+  {{- range $host := $cluster.hosts }}
+  {{- printf "- %s:%d" $host ($port | int) | nindent 4 }}
+  {{- end }}
+  labels:
+    project: {{ $cluster.cluster | quote }}
+    counter: {{ $counter.counter | quote }}
+    container: ydb-dynamic
+{{- end }} \ No newline at end of file
diff --git a/ydb/deploy/helm/ydb-prometheus/templates/configmaps-grafana.yaml b/ydb/deploy/helm/ydb-prometheus/templates/configmaps-grafana.yaml
new file mode 100644
index 0000000000..93db46b7d9
--- /dev/null
+++ b/ydb/deploy/helm/ydb-prometheus/templates/configmaps-grafana.yaml
@@ -0,0 +1,20 @@
+{{/*
+Grafana dashboards.
+Renders a ConfigMapList with one ConfigMap per "dashboards/*.json" file shipped
+in the chart. Nothing is rendered unless .Values.grafanaDashboards.enabled is
+true and at least one dashboard file matches the glob.
+Each ConfigMap is named "<fullname>-<file name without .json>" (truncated to 63
+characters), carries the common labels plus the Grafana marker label, and
+stores the file contents under the key "<file name>.json" as a JSON-encoded
+string.
+*/}}
+{{- $namespace := .Release.Namespace -}}
+{{- $files := .Files.Glob "dashboards/*.json" }}
+{{- if and .Values.grafanaDashboards.enabled $files }}
+apiVersion: v1
+kind: ConfigMapList
+items:
+{{- range $path, $fileContents := $files }}
+{{- $dashboardName := regexReplaceAll "(^.*/)(.*)\\.json$" $path "${2}" }}
+- apiVersion: v1
+ kind: ConfigMap
+ metadata:
+ name: {{ printf "%s-%s" (include "ydb-prometheus.fullname" $) $dashboardName | trunc 63 | trimSuffix "-" }}
+ namespace: {{ $namespace }}
+ labels:
+{{- include "ydb-prometheus.labels" $ | nindent 6 }}
+{{- include "ydb-prometheus.labels.grafana" $ | nindent 6 }}
+ data:
+ {{ $dashboardName }}.json: {{ $.Files.Get $path | toJson }}
+{{- end }}
+{{- end }}
diff --git a/ydb/deploy/helm/ydb-prometheus/templates/secret-prometheus-static-targets.yaml b/ydb/deploy/helm/ydb-prometheus/templates/secret-prometheus-static-targets.yaml
new file mode 100644
index 0000000000..9f1081d18c
--- /dev/null
+++ b/ydb/deploy/helm/ydb-prometheus/templates/secret-prometheus-static-targets.yaml
@@ -0,0 +1,19 @@
+{{/*
+Secret with additional Prometheus scrape configs for external YDB clusters.
+Rendered only when .Values.prometheusConfig.enabled is true and the
+"ydb-prometheus.externalCluster.config" helper produces non-empty output.
+The Secret name is .Values.prometheusConfig.additionalScrapeConfigs.name, or
+"<fullname>-additional-scrape-configs" when that value is empty; it is
+truncated to 63 characters. The rendered scrape configs are stored
+base64-encoded under the key .Values.prometheusConfig.additionalScrapeConfigs.key.
+*/}}
+{{- if .Values.prometheusConfig.enabled -}}
+{{- $namespace := .Release.Namespace -}}
+{{- $name := .Values.prometheusConfig.additionalScrapeConfigs.name -}}
+ {{- if not $name -}}
+ {{- $name = printf "%s-additional-scrape-configs" (include "ydb-prometheus.fullname" $) -}}
+ {{- end -}}
+ {{- $data := include "ydb-prometheus.externalCluster.config" $ -}}
+ {{- if $data -}}
+apiVersion: v1
+kind: Secret
+metadata:
+ name: {{ $name | trunc 63 | trimSuffix "-" }}
+ namespace: {{ $namespace }}
+ labels:
+{{- include "ydb-prometheus.labels" $ | nindent 4 }}
+data:
+ {{ .Values.prometheusConfig.additionalScrapeConfigs.key }}: {{ $data | b64enc | quote }}
+ {{- end -}}
+{{- end -}} \ No newline at end of file
diff --git a/ydb/deploy/helm/ydb-prometheus/templates/service-monitor-cluster.yaml b/ydb/deploy/helm/ydb-prometheus/templates/service-monitor-cluster.yaml
new file mode 100644
index 0000000000..dc805a4865
--- /dev/null
+++ b/ydb/deploy/helm/ydb-prometheus/templates/service-monitor-cluster.yaml
@@ -0,0 +1,2 @@
+{{/*
+Placeholder template: the guarded body is empty, so nothing is rendered yet.
+NOTE(review): presumably reserved for the ServiceMonitor of internal clusters,
+which the chart README lists as work in progress -- confirm.
+*/}}
+{{- if .Values.prometheusConfig.enabled -}}
+{{- end -}} \ No newline at end of file
diff --git a/ydb/deploy/helm/ydb-prometheus/values.yaml b/ydb/deploy/helm/ydb-prometheus/values.yaml
new file mode 100644
index 0000000000..f7ee1d284a
--- /dev/null
+++ b/ydb/deploy/helm/ydb-prometheus/values.yaml
@@ -0,0 +1,97 @@
+# Overrides for the chart name and for the names of the chart's resources
+nameOverride: ""
+fullnameOverride: ""
+
+# Enable kube-prometheus-stack chart
+kube-prometheus-stack:
+ enabled: true
+
+ # Install prometheus crd object
+ prometheus:
+ enabled: true
+
+ # Enable only for monitoring ydb external clusters
+ # prometheusSpec:
+ # additionalScrapeConfigsSecret:
+ # enabled: true
+ # name: ydb-prometheus-additional-scrape-configs
+ # key: additional-scrape-configs.yaml
+
+# Grafana dashboards
+grafanaDashboards:
+ # Enable dashboards generation
+ enabled: true
+
+ # Label which will be used to mark configmaps with grafana dashboards
+ markerLabel:
+ key: grafana_dashboard
+ value: "1"
+
+# Prometheus server related configuration
+prometheusConfig:
+ # Enable service monitors & scrape configs generation
+ enabled: true
+
+ # Additional scrape configs that are referenced in prometheus.spec
+ #
+ additionalScrapeConfigs:
+ # Generated secret key
+ key: additional-scrape-configs.yaml
+ # Generated secret name; if empty, "<chart fullname>-additional-scrape-configs" is used
+ name: ""
+
+# YDB Monitoring related configuration
+ydb:
+ # Predefined monitoring counters to scrape
+ counters:
+ # Counter name
+ # counter: <counter name>
+ #
+ # Counter type
+ # If unspecified, the counter is scraped from both static and dynamic nodes
+ # type: <static|dynamic>
+ #
+ # Metrics path
+ # If unspecified, /counters/counters=<counter name>/prometheus is used
+ # metricsPath: <url to prometheus metrics>
+ #
+ - counter: auth
+ - counter: compile
+ - counter: dsproxy
+ - counter: dsproxy_percentile
+ - counter: dsproxy_queue
+ - counter: dsproxynode
+ - counter: grpc
+ - counter: interconnect
+ - counter: kqp
+ - counter: netclassifier
+ - counter: pdisks
+ type: static
+ - counter: pqproxy
+ - counter: proxy
+ - counter: quoter_service
+ - counter: tablets
+ - counter: vdisks
+ type: static
+ - counter: utils
+ - counter: ydb
+ metricsPath: /counters/counters=ydb/name_label=name/prometheus
+
+ # Clusters to monitor
+ clusters: []
+ # Cluster name
+ # cluster: <cluster name>
+ #
+ # Cluster type
+ # For external clusters, additional scrape config secret will be generated
+ # For internal clusters, service monitor with appropriate configuration will be generated
+ # type: <external|internal>
+ #
+ # External cluster targets, used only if type == external
+ # external:
+ # Monitoring ports for static and dynamic nodes
+ # ports:
+ # static: <static (storage) node port>
+ # dynamic: <dynamic (tenants) node ports>
+ # Monitoring hosts
+ # hosts: <cluster targets>