# values.yaml
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Values shared by the parent chart and every subchart.
global:
  dryRun: false
  image:
    tag: "main"

  initContainerImage:
    repository: docker.io/bitnamilegacy/os-shell
    tag: "12-debian-12-r30"
    pullPolicy: IfNotPresent

  metricsPort: 2112

  # Datastore configuration
  # By default, datastore is not configured (internal mongodb-config ConfigMap is used).
  # Uncomment and configure this section to use an external/managed database service,
  # or to switch providers (e.g. from MongoDB to PostgreSQL).
  #
  # When using an external DB, also set:
  #   global.mongodbStore.enabled: false   (skip deploying internal MongoDB)
  #
  # datastore:
  #   # Database provider. Supported values: "mongodb", "postgresql"
  #   provider: "mongodb"
  #
  #   # --- MongoDB connection URI ---
  #   # External MongoDB (global.mongodbStore.enabled false): full URI only via Secret
  #   # key MONGODB_URI and credentialsFromSecret.name (envFrom.secretRef). Do not set uri.
  #   # In-cluster + global.mongodbStore.enabled true: credential-free URI can be built from
  #   # connection.* in the ConfigMap unless credentialsFromSecret is set; see docs/external-datastore.md.
  #   credentialsFromSecret:
  #     name: ""
  #
  #   # --- Connection metadata ---
  #   # Individual fields (PostgreSQL; optional MongoDB metadata such as database/collection names).
  #   # For external MongoDB, connection.database etc. still configure collection env vars;
  #   # the URI itself always comes from the Secret above.
  #   connection:
  #     host: ""
  #     port: 5432  # 5432 for PostgreSQL, 27017 for MongoDB
  #     database: "HealthEventsDatabase"
  #     # PostgreSQL-only fields:
  #     # username: ""
  #     # sslmode: "require"  # disable | require | verify-ca | verify-full
  #
  #   # --- TLS ---
  #   tls:
  #     # Set to false only if the external DB does not support TLS (not recommended).
  #     enabled: true
  #     # Name of a Kubernetes Secret containing the CA certificate (key: ca.crt).
  #     # Required when connecting to an external DB with a private/custom CA.
  #     # Leave empty to use system CAs (e.g. public cloud services with public CA certs).
  #     caSecretName: ""
  #
  #   # --- Authentication ---
  #   auth:
  #     # Authentication mechanism:
  #     #   scram - username/password (SCRAM-SHA-256). Credentials come from the MONGODB_URI
  #     #           Secret (credentialsFromSecret) or from existingSecret.
  #     #   x509  - client certificate authentication (used by internal MongoDB).
  #     mechanism: "scram"
  #     # For scram: name of a Kubernetes Secret containing DB credentials.
  #     # Secret must have keys: username, password.
  #     # Not needed when application auth is via MONGODB_URI in credentialsFromSecret only.
  #     existingSecret: ""
  #     # For x509: name of a Kubernetes Secret containing the client certificate.
  #     # Secret must have keys: tls.crt, tls.key, ca.crt.
  #     clientCertSecretName: ""
  #     # For x509: Subject DN of the application user certificate.
  #     # Defaults to the NVIDIA internal DN if not set.
  #     x509ApplicationUserDN: ""
  #     # For x509: Subject DN of the dgxcops user certificate.
  #     # Defaults to the NVIDIA internal DN if not set.
  #     x509DgxcopsUserDN: ""
  #
  #   # --- Database setup job ---
  #   # The setup job creates required collections and indexes in the external MongoDB.
  #   # It runs once on install/upgrade when using an external DB.
  #   setupJob:
  #     image:
  #       repository: ghcr.io/rtsp/docker-mongosh
  #       tag: "2.5.2"
  #       pullPolicy: IfNotPresent
  #     # Optional: name of an existing Kubernetes Secret with admin credentials
  #     # for the setup job to authenticate. Secret must have keys: username, password.
  #     # Not needed if MONGODB_URI in credentialsFromSecret has sufficient privileges.
  #     adminSecret: ""

  # Certificate rotation (MongoDB only)
  # When enabled, client certificates can be rotated without restarting pods.
  # Uses controller-runtime's certwatcher to detect file changes and automatically
  # provide updated certificates to new MongoDB connections.
  certificateRotationEnabled: false

  # Shared metadata path used by metadata-collector and syslog-health-monitor
  metadataPath: /var/lib/nvsentinel/gpu_metadata.json

  # DCGM (Data Center GPU Manager) configuration
  # Used by gpu-health-monitor
  dcgm:
    # Enable DCGM Kubernetes service integration
    enabled: true
    service:
      # DCGM hostengine service endpoint
      endpoint: "nvidia-dcgm.gpu-operator.svc"
      # DCGM hostengine service port
      port: 5555

  # OpenTelemetry tracing configuration
  tracing:
    enabled: false  # Enable/disable tracing for all components
    insecure: false  # Set to true to disable TLS for the collector connection
    # OTel collector's OTLP gRPC endpoint (host:port) for trace ingestion
    endpoint: ""

  # Audit logging configuration
  auditLogging:
    enabled: false  # Enable/disable audit logging for all components
    logRequestBody: false  # Log request body in audit logs (may contain sensitive data)
    maxSizeMB: 100  # Maximum size of each audit log file in MB
    maxBackups: 7  # Maximum number of rotated log files to keep
    maxAgeDays: 30  # Maximum age of rotated log files in days
    compress: true  # Compress rotated log files

  # Scheduling defaults and image pull secrets
  nodeSelector: {}
  tolerations:
    - operator: Exists
  affinity: {}
  systemNodeSelector: {}
  systemNodeTolerations: []
  imagePullSecrets: []

  # Per-module enable flags
  gpuHealthMonitor:
    enabled: true
  healthEventsAnalyzer:
    enabled: false
  faultQuarantine:
    enabled: false
  nodeDrainer:
    enabled: false
  faultRemediation:
    enabled: false
  janitor:
    enabled: false
  janitorProvider:
    enabled: false
  cspHealthMonitor:
    enabled: false
  syslogHealthMonitor:
    enabled: true
  labeler:
    enabled: true
  metadataCollector:
    enabled: true
  inclusterFileServer:
    enabled: false
    metricsPort: 9001
    cleanupMetricsPort: 9002
  mongodbStore:
    enabled: false
  kubernetesObjectMonitor:
    enabled: false
  eventExporter:
    enabled: false
  preflight:
    enabled: false
  k8sdatastoreCrds:
    enabled: false
  slurmDrainMonitor:
    enabled: false
# Network policy configuration
# The metrics-access network policy restricts ingress to metrics ports only.
# This can block other services (e.g., cert-manager webhook) when deployed
# in the same namespace. Set enabled=false to disable the network policy.
networkPolicy:
  # Set to false to skip creating the metrics-access network policy.
  enabled: true
# Platform connector settings: image, resources, datastore sinks, Kubernetes
# connector and the health event transformer pipeline.
platformConnector:
  image:
    repository: ghcr.io/nvidia/nvsentinel/platform-connectors
    pullPolicy: IfNotPresent
    tag: ""

  resources:
    limits:
      cpu: 200m
      memory: 512Mi
    requests:
      cpu: 200m
      memory: 512Mi

  podAnnotations: {}
  tolerations: []
  logLevel: info

  mongodbStore:
    enabled: false
    clientCertMountPath: "/etc/ssl/mongo-client"
    maxRetries: 3

  postgresqlStore:
    clientCertMountPath: "/etc/ssl/client-certs"

  # gRPC sink connector - forwards health events to an external gRPC server
  # using the PlatformConnector HealthEventOccurredV1 RPC.
  grpcSinkConnector:
    # Enable or disable the gRPC sink connector.
    enabled: false
    # gRPC server address, e.g. "my-service.example.com:50051".
    target: ""
    # Number of retry attempts (with exponential backoff) before dropping an event.
    # Total send attempts = 1 initial + maxRetries retries.
    maxRetries: 3
    # Optional: path to a projected Kubernetes ServiceAccount token for
    # bearer token authentication.
    # Leave empty to disable authentication.
    tokenPath: ""

  k8sConnector:
    enabled: true
    maxNodeConditionMessageLength: 1024
    # Max length of the compacted message prefix (before Recommended Action) per health event
    compactedHealthEventMsgLen: 72
    qps: 5.0
    burst: 10

  # Health event transformers pipeline (includes node metadata enrichment)
  pipeline:
    # List of transformers to execute (in order)
    - name: MetadataAugmentor
      enabled: false
      config: /etc/config/metadata.toml
    - name: OverrideTransformer
      enabled: false
      config: /etc/config/overrides.toml

  # Transformer-specific configurations
  transformers:
    # Metadata augmentor - enriches events with node labels and provider info
    MetadataAugmentor:
      cacheSize: 50
      cacheTTLSeconds: 3600
      allowedLabels:
        - "topology.kubernetes.io/zone"
        - "topology.kubernetes.io/region"
        - "node.kubernetes.io/instance-type"
        - "nvidia.com/cuda.driver-version.major"
        - "nvidia.com/cuda.driver-version.minor"
        - "nvidia.com/cuda.driver-version.revision"
        - "nvidia.com/cuda.driver-version.full"
        - "nvidia.com/cuda.runtime-version.major"
        - "nvidia.com/cuda.runtime-version.minor"
        - "nvidia.com/cuda.runtime-version.full"
        - "topology.k8s.aws/capacity-block-id"
        - "topology.k8s.aws/network-node-layer-1"
        - "topology.k8s.aws/network-node-layer-2"
        - "topology.k8s.aws/network-node-layer-3"
        - "oci.oraclecloud.com/host.id"
        - "oci.oraclecloud.com/host.network_block_id"
        - "oci.oraclecloud.com/host.rack_id"
        - "oci.oraclecloud.com/host.serial_number"
        - "cloud.google.com/gce-topology-block"
        - "cloud.google.com/gce-topology-host"
        - "cloud.google.com/gce-topology-subblock"
        # Topograph-applied topology labels (present when Topograph is deployed in the cluster;
        # see https://github.com/NVIDIA/topograph/blob/main/docs/reference/node-labels.md)
        - "network.topology.nvidia.com/accelerator"
        - "network.topology.nvidia.com/leaf"
        - "network.topology.nvidia.com/spine"
        - "network.topology.nvidia.com/core"

    # Property overrides - uses CEL expressions to override event properties
    OverrideTransformer:
      rules: []
      # Example:
      # - name: "suppress-xid-109"
      #   when: 'event.agent == "syslog-health-monitor" && "109" in event.errorCode'
      #   override:
      #     isFatal: false
      #     recommendedAction: "NONE"
# NOTE(review): confirm socketPath is intended as a top-level value rather
# than platformConnector.socketPath.
socketPath: "/var/run/nvsentinel.sock"
# Node condition cleanup hook configuration
nodeConditionCleanup:
  enabled: false
  # Condition types handled by the cleanup hook; empty by default.
  deprecatedConditions: []
  # - "OldConditionType1"
  # - "OldConditionType2"
  image:
    repository: docker.io/bitnamilegacy/kubectl
    tag: "1.30.6"
    pullPolicy: IfNotPresent
  resources:
    limits:
      cpu: 100m
      memory: 128Mi
    requests:
      cpu: 50m
      memory: 64Mi
# PostgreSQL subchart configuration
# Only enabled when using PostgreSQL as the datastore
# See values-postgresql.yaml for a complete PostgreSQL configuration example
postgresql:
  # Off by default; turn on only when PostgreSQL is the datastore.
  enabled: false
# PodMonitor configuration for Prometheus Operator
# Disable this if using standard Prometheus with annotations instead
podMonitor:
  enabled: true
  interval: 30s  # Scrape interval (defaults to 30s to maintain backward compatibility)
  # Optional settings; uncomment to override.
  # scrapeTimeout: ""  # Scrape timeout (defaults to Prometheus default if not specified)
  # metricsPath: "/metrics"  # Metrics path (defaults to /metrics)
  # labels: {}  # Additional labels for the PodMonitor resource