-
Notifications
You must be signed in to change notification settings - Fork 72
Expand file tree
/
Copy pathvalues-full.yaml
More file actions
1778 lines (1572 loc) · 65.5 KB
/
values-full.yaml
File metadata and controls
1778 lines (1572 loc) · 65.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
# NVSentinel Complete Configuration Reference
#
# This file documents every possible configuration option for NVSentinel.
# All options are set to their default values unless otherwise noted.
#
# NVSentinel is a Kubernetes-native system for detecting, quarantining,
# and remediating node failures in GPU clusters. It consists of:
#
# - Health Monitors: Detect hardware and software failures
# - Fault Quarantine: Cordon and taint nodes with detected faults
# - Node Drainer: Gracefully evict workloads from failing nodes
# - Fault Remediation: Create maintenance resources to trigger node recovery
# - Janitor: Execute node reboots and terminations via cloud provider APIs
#
# For production use, enable the core modules and configure them for your
# specific environment.
################################################################################
################################################################################
# GLOBAL CONFIGURATION
#
# Settings shared by every NVSentinel component.
# Individual module sections may override any of these values.
################################################################################
global:
  # Container image defaults applied to all NVSentinel images.
  image:
    # Tag used by every NVSentinel container, e.g. "main" (latest development)
    # or "v1.0.0" (pinned release). Leave empty to fall back to the chart's
    # appVersion from Chart.yaml.
    tag: "main"
  # Port on which components serve Prometheus metrics (and, for components not
  # built on controller-runtime, their health endpoints as well). Point your
  # Prometheus scrape config at this port to monitor NVSentinel.
  metricsPort: 2112
  # Port serving healthz/readyz probes for components built on
  # controller-runtime; those components use this dedicated management port
  # instead of metricsPort for health checks.
  healthPort: 9440
  # Dry-run mode: every component logs the action it WOULD take without
  # executing it. Useful for safely testing configuration changes, validating
  # rule behavior before production rollout, and debugging without touching
  # the cluster. With dryRun on, components will NOT:
  #   - cordon or taint nodes (fault-quarantine)
  #   - evict or delete pods (node-drainer)
  #   - create maintenance resources (fault-remediation)
  #   - reboot or terminate nodes (janitor)
  dryRun: false
  # Pull secrets for private registries. Create the secret first, e.g.:
  #   kubectl create secret docker-registry my-registry-secret \
  #     --docker-server=myregistry.com \
  #     --docker-username=myuser \
  #     --docker-password=mypassword
  # then reference it:
  #   imagePullSecrets:
  #     - name: my-registry-secret
  imagePullSecrets: []
  # Node selector applied to all NVSentinel workloads. Example — schedule only
  # on nodes labeled "nvsentinel=enabled":
  #   nodeSelector:
  #     nvsentinel: "enabled"
  # Empty means any node is eligible.
  nodeSelector: {}
  # Tolerations applied to all NVSentinel workloads, allowing pods onto nodes
  # with matching taints. Example — tolerate GPU nodes:
  #   tolerations:
  #     - key: "nvidia.com/gpu"
  #       operator: "Exists"
  #       effect: "NoSchedule"
  tolerations: []
  # Affinity rules for all NVSentinel workloads. Example — prefer spreading
  # pods across availability zones:
  #   affinity:
  #     podAntiAffinity:
  #       preferredDuringSchedulingIgnoredDuringExecution:
  #         - weight: 100
  #           podAffinityTerm:
  #             topologyKey: topology.kubernetes.io/zone
  affinity: {}
  # Node selector for control-plane-style system components
  # (platform-connectors, fault-quarantine, node-drainer, fault-remediation).
  # Example — pin them to control plane nodes:
  #   systemNodeSelector:
  #     node-role.kubernetes.io/control-plane: ""
  # Empty falls back to the global nodeSelector above.
  systemNodeSelector: {}
  # Tolerations for the same system components, e.g. to run on tainted
  # control plane nodes:
  #   systemNodeTolerations:
  #     - key: "node-role.kubernetes.io/control-plane"
  #       operator: "Exists"
  #       effect: "NoSchedule"
  # Empty falls back to the global tolerations above.
  systemNodeTolerations: []
################################################################################
# MODULE ENABLE/DISABLE FLAGS
#
# Select which NVSentinel components the chart deploys.
# Production installs typically enable every module except
# healthEventsAnalyzer.
################################################################################
# GPU Health Monitor — watches GPU hardware health through DCGM.
# Detects XID errors, thermal issues, ECC errors, memory errors, and NVLink
# failures. Requires the NVIDIA GPU Operator with DCGM installed.
gpuHealthMonitor:
  enabled: true
# Health Events Analyzer — mines historical health events for patterns and
# produces insights/recommendations. Requires the MongoDB store.
healthEventsAnalyzer:
  enabled: false
# Fault Quarantine — cordons/taints nodes once faults are detected, keeping
# new workloads off failing nodes. Quarantine decisions are rule-driven.
# Requires the MongoDB store.
faultQuarantine:
  enabled: false
# Node Drainer — gracefully evicts workloads from quarantined nodes ahead of
# maintenance, with per-namespace eviction strategies. Requires the MongoDB
# store and faultQuarantine.
nodeDrainer:
  enabled: false
# Fault Remediation — creates maintenance resources that drive node recovery
# (the janitor acts on them), using configurable templates per action.
# Requires the MongoDB store and janitor.
faultRemediation:
  enabled: false
# Janitor — performs the actual node reboots/terminations via cloud provider
# APIs (AWS, GCP, Azure, OCI, kind for testing). Requires cloud credentials.
janitor:
  enabled: false
# CSP Health Monitor — watches cloud provider APIs for scheduled maintenance
# events. Supports GCP (Compute Engine) and AWS (EC2). Requires cloud
# credentials.
cspHealthMonitor:
  enabled: false
# Syslog Health Monitor — scans journalctl output for kernel/driver fault
# patterns. Runs as a DaemonSet on every node and needs host journalctl
# access.
syslogHealthMonitor:
  enabled: true
# XID sidecar — parses NVIDIA XID errors out of dmesg and extracts detailed
# XID information.
xidSideCar:
  enabled: false
# Labeler — labels nodes according to detected capabilities (Kata Containers
# support, node features). Runs as a Deployment covering the whole cluster.
labeler:
  enabled: true
# In-cluster File Server — HTTP endpoint that log-collector jobs upload
# diagnostic bundles to. Only needed when fault-remediation log collection is
# enabled.
inclusterFileServer:
  enabled: false
  # Metrics port for the file server itself.
  metricsPort: 9001
  # Metrics port for the bundle cleanup job.
  cleanupMetricsPort: 9002
# MongoDB Store — persistence layer for health events, deployed via the
# Bitnami MongoDB chart with TLS. Required by fault-quarantine, node-drainer,
# and fault-remediation.
mongodbStore:
  enabled: false
# Event Exporter — ships health events to a remote endpoint. Requires the
# MongoDB store.
eventExporter:
  enabled: false
# Preflight — mutating admission webhook that injects GPU diagnostic init
# containers. Does not use MongoDB; requires cert-manager (or the OpenShift
# service CA), DCGM, and labeled namespaces. Subchart values live under the
# top-level preflight: key and charts/preflight/values.yaml; see
# docs/configuration/preflight.md.
preflight:
  enabled: false
################################################################################
# PLATFORM CONNECTOR CONFIGURATION
#
# Central hub that receives health events from every monitor, persists them
# to MongoDB, and updates Kubernetes node status. It is the bridge between
# the health monitors and the remediation pipeline.
################################################################################
platformConnector:
  # Container image settings.
  image:
    repository: ghcr.io/nvidia/nvsentinel/platform-connectors
    # When to pull the image:
    #   IfNotPresent — pull only if missing locally (recommended)
    #   Always       — pull on every start
    #   Never        — use only a pre-loaded local image
    pullPolicy: IfNotPresent
    # Per-component tag override; empty uses global.image.tag.
    tag: ""
  # Pod resources. Limits cap usage; requests are guaranteed.
  resources:
    limits:
      cpu: 200m
      memory: 512Mi
    requests:
      cpu: 200m
      memory: 512Mi
  # Extra pod annotations, e.g. to opt out of Istio sidecar injection:
  #   podAnnotations:
  #     sidecar.istio.io/inject: "false"
  podAnnotations: {}
  # Tolerations for platform-connector pods; used when global.tolerations is
  # not set.
  tolerations: []
  # Kubernetes connector — writes node status and conditions.
  k8sConnector:
    # Upper bound on a node condition message's length. Longer messages are
    # truncated so the node status object does not grow unbounded.
    maxNodeConditionMessageLength: 1024
    # Maximum length of the compacted message prefix (before the Recommended
    # Action) per health event — i.e. how much of the ErrorCode and entity
    # identifiers (GPU, PCI, ...) survives compaction into
    # maxNodeConditionMessageLength.
    compactedHealthEventMsgLen: 72
    # Toggle Kubernetes status updates.
    enabled: true
    # API-server rate limiting: sustained queries per second ...
    qps: 5.0
    # ... and maximum burst above that rate.
    burst: 10
  # Health event transformer pipeline. Transformers run in listed order,
  # processing events before storage and propagation.
  pipeline:
    # Enriches events with node labels and provider information.
    - name: MetadataAugmentor
      enabled: false # Disabled by default
      config: /etc/config/metadata.toml
    # Applies CEL-driven property overrides to events.
    - name: OverrideTransformer
      enabled: false # Disabled by default
      config: /etc/config/overrides.toml
  # Per-transformer settings.
  transformers:
    # MetadataAugmentor — copies selected node labels / provider info onto
    # health events.
    MetadataAugmentor:
      # LRU cache size for node metadata; larger means more memory, fewer
      # API calls.
      cacheSize: 50
      # Node-metadata cache TTL in seconds (duration strings like "1h",
      # "30m" are also accepted).
      cacheTTLSeconds: 3600
      # Allow-list of node labels copied onto events, enabling CEL rules to
      # match on topology, instance type, etc. Only these labels propagate.
      allowedLabels:
        - "topology.kubernetes.io/zone"
        - "topology.kubernetes.io/region"
        - "node.kubernetes.io/instance-type"
        - "nvidia.com/cuda.driver-version.major"
        - "nvidia.com/cuda.driver-version.minor"
        - "nvidia.com/cuda.driver-version.revision"
        - "nvidia.com/cuda.driver-version.full"
        - "nvidia.com/cuda.runtime-version.major"
        - "nvidia.com/cuda.runtime-version.minor"
        - "nvidia.com/cuda.runtime-version.full"
        - "topology.k8s.aws/capacity-block-id"
        - "topology.k8s.aws/network-node-layer-1"
        - "topology.k8s.aws/network-node-layer-2"
        - "topology.k8s.aws/network-node-layer-3"
        - "oci.oraclecloud.com/host.id"
        - "oci.oraclecloud.com/host.network_block_id"
        - "oci.oraclecloud.com/host.rack_id"
        - "oci.oraclecloud.com/host.serial_number"
        - "cloud.google.com/gce-topology-block"
        - "cloud.google.com/gce-topology-host"
        - "cloud.google.com/gce-topology-subblock"
        # Labels applied by Topograph when it is deployed in the cluster; see
        # https://github.com/NVIDIA/topograph/blob/main/docs/reference/node-labels.md
        - "network.topology.nvidia.com/accelerator"
        - "network.topology.nvidia.com/leaf"
        - "network.topology.nvidia.com/spine"
        - "network.topology.nvidia.com/core"
    # OverrideTransformer — CEL expressions that rewrite event properties,
    # letting operators suppress errors or swap recommended actions.
    OverrideTransformer:
      # Override rules, evaluated in order; the first match wins.
      rules: []
      # Example — suppress XID 109 errors:
      #   - name: "suppress-xid-109"
      #     when: 'event.agent == "syslog-health-monitor" && "109" in event.errorCode'
      #     override:
      #       isFatal: false
      #       recommendedAction: "NONE"
      #
      # Example — zone-specific override:
      #   - name: "zone-specific-override"
      #     when: 'event.metadata["topology.kubernetes.io/zone"] == "us-west1-a" && event.componentClass == "GPU"'
      #     override:
      #       isFatal: false
      #       recommendedAction: "CONTACT_SUPPORT"
  # Unix domain socket for IPC: health monitors connect to
  # platform-connectors here, so the path must be reachable by both sides.
  socketPath: "/var/run/nvsentinel.sock"
################################################################################
# FAULT-QUARANTINE MODULE CONFIGURATION
#
# Watches MongoDB for health events and quarantines affected nodes:
# cordoning (blocks new pods) and tainting (repels pods lacking a
# toleration), plus tracking labels/annotations that record quarantine state.
################################################################################
fault-quarantine:
  # Replica count; keep at 1 for consistent decision-making.
  replicaCount: 1
  # Logging verbosity: debug (everything), info (normal operations and
  # notable events), warn (warning conditions), error (errors only).
  logLevel: info
  # Container image settings.
  image:
    repository: ghcr.io/nvidia/nvsentinel/fault-quarantine
    pullPolicy: IfNotPresent
    tag: ""
  # Pod resources (limits cap usage; requests are guaranteed).
  resources:
    limits:
      cpu: "1"
      memory: "1Gi"
    requests:
      cpu: "1"
      memory: "1Gi"
  # Scheduling knobs; when set these take precedence over the globals.
  nodeSelector: {}
  affinity: {}
  tolerations: []
  podAnnotations: {}
  # Prefix for the tracking labels/annotations written to nodes, e.g.
  # <labelPrefix>cordon-by and <labelPrefix>cordon-reason, recording when and
  # why a node was quarantined. Also used for uncordon tracking.
  labelPrefix: "k8saas.nvidia.com/"
  # Circuit breaker: blocks new quarantines once too much of the cluster is
  # already cordoned, preventing a cascading failure from cordoning
  # everything. State persists in a ConfigMap across restarts.
  circuitBreaker:
    # Turn the circuit breaker on/off.
    enabled: true
    # Trip threshold as a percentage of cluster nodes (range 1-100). At 50,
    # once half the nodes are cordoned no further quarantines are allowed.
    percentage: 50
    # Cooldown after tripping: the breaker stays open at least this long,
    # even if the cordoned-node count drops back under the threshold.
    # Examples: "5m", "10m", "1h".
    duration: "5m"
  # Quarantine rules. Each rule pairs match conditions (when it fires) with
  # actions (cordon/taint/label). Rules are evaluated in order per event.
  ruleSets:
    # Rule 1: quarantine nodes reporting fatal GPU errors.
    # 'enabled: false' skips the ruleset entirely.
    - enabled: true
      version: "1"
      name: "GPU fatal error ruleset"
      # 'all' = AND (every condition must hold); 'any' = OR (at least one).
      match:
        all:
          # CEL over the health event; fields include agent, componentClass,
          # isFatal, nodeName, severity, etc.
          - kind: "HealthEvent"
            expression: "event.agent == 'gpu-health-monitor' && event.componentClass == 'GPU' && event.isFatal == true"
          # CEL over the node object (node.metadata.labels,
          # node.metadata.annotations, node.spec, ...). This one skips nodes
          # explicitly opted out of NVSentinel management.
          - kind: "Node"
            expression: |
              !('k8saas.nvidia.com/ManagedByNVSentinel' in node.metadata.labels && node.metadata.labels['k8saas.nvidia.com/ManagedByNVSentinel'] == "false")
      # Cordon: set node.spec.unschedulable=true so no new pods land on the
      # node; already-running pods keep running.
      cordon:
        shouldCordon: true
      # Optional taint action — uncomment to also taint matching nodes:
      #   taint:
      #     # Taint key identifier
      #     key: "nvidia.com/gpu-error"
      #     # Taint value (additional context)
      #     value: "fatal"
      #     # Effect on pods without a matching toleration:
      #     #   NoSchedule       — new pods won't schedule
      #     #   PreferNoSchedule — scheduler avoids the node (soft)
      #     #   NoExecute        — existing pods are evicted
      #     effect: "NoSchedule"
    # Rule 2: quarantine nodes with CSP maintenance events.
    - enabled: true
      version: "1"
      name: "CSP health monitor fatal error ruleset"
      match:
        all:
          - kind: "HealthEvent"
            expression: "event.agent == 'csp-health-monitor' && event.checkName == 'CSPMaintenance' && event.isFatal == true"
          - kind: "Node"
            expression: |
              !('k8saas.nvidia.com/ManagedByNVSentinel' in node.metadata.labels && node.metadata.labels['k8saas.nvidia.com/ManagedByNVSentinel'] == "false")
      cordon:
        shouldCordon: true
    # Rule 3: quarantine nodes with fatal syslog-detected GPU errors.
    - enabled: true
      version: "1"
      name: "Syslog fatal error ruleset"
      match:
        all:
          - kind: "HealthEvent"
            expression: "event.agent == 'syslog-health-monitor' && event.componentClass == 'GPU' && event.isFatal == true"
          - kind: "Node"
            expression: |
              !('k8saas.nvidia.com/ManagedByNVSentinel' in node.metadata.labels && node.metadata.labels['k8saas.nvidia.com/ManagedByNVSentinel'] == "false")
      cordon:
        shouldCordon: true
################################################################################
# NODE-DRAINER MODULE CONFIGURATION
#
# Evicts workloads from quarantined nodes ahead of maintenance. Per-namespace
# eviction strategies:
#   - Immediate:          evict right away
#   - AllowCompletion:    wait for graceful termination
#   - DeleteAfterTimeout: wait for a timeout, then force delete
################################################################################
node-drainer:
  # Replica count; keep at 1 for consistent drain ordering.
  replicaCount: 1
  # Logging verbosity (debug/info/warn/error).
  logLevel: info
  # Container image settings.
  image:
    repository: ghcr.io/nvidia/nvsentinel/node-drainer
    pullPolicy: IfNotPresent
    tag: ""
  # Pod resources (limits cap usage; requests are guaranteed).
  resources:
    limits:
      cpu: "200m"
      memory: "300Mi"
    requests:
      cpu: "200m"
      memory: "300Mi"
  podAnnotations: {}
  # Maximum seconds to wait for a pod's graceful termination; afterwards the
  # pod may be force deleted, depending on the eviction mode. Must be a
  # positive integer.
  evictionTimeoutInSeconds: "60"
  # Regex of system namespaces whose pods are never drained, protecting
  # critical infrastructure. Matched against namespace names.
  systemNamespaces: "^(nvsentinel|kube-system|gpu-operator|gmp-system|network-operator|skyhook)$"
  # DeleteAfterTimeout mode: minutes (measured from the health event's
  # creation time) before remaining pods are force deleted, so drains cannot
  # wait forever. Default: 60.
  deleteAfterTimeoutMinutes: 60
  # Minutes after which a NotReady pod is treated as stuck and becomes a
  # candidate for force deletion — catches unhealthy pods that will never
  # finish gracefully. Default: 5.
  notReadyTimeoutMinutes: 5
  # Per-namespace eviction strategies; multiple entries with namespace
  # patterns are allowed.
  userNamespaces:
    # Catch-all for user namespaces not captured by systemNamespaces.
    - name: "*"
      # Eviction mode:
      #   "Immediate"          — evict now, skip graceful termination
      #                          (stateless apps, batch jobs)
      #   "AllowCompletion"    — honor terminationGracePeriodSeconds
      #                          (long-running jobs, stateful apps)
      #   "DeleteAfterTimeout" — wait deleteAfterTimeoutMinutes, then force
      #                          delete (jobs that should finish but may hang)
      mode: "AllowCompletion"
    # Example — immediate eviction for a namespace:
    #   - name: "batch-jobs"
    #     mode: "Immediate"
    # Example — delete after timeout for another:
    #   - name: "training-jobs"
    #     mode: "DeleteAfterTimeout"
  # When enabled, only pods using the GPU_UUID impacted entity from
  # COMPONENT_RESET HealthEvents are drained; when disabled, all eligible
  # pods on the impacted node (in configured namespaces) are drained
  # regardless of the remediation action. COMPONENT_RESET events must then
  # carry an impacted entity for the unhealthy GPU_UUID or the drain fails.
  # IMPORTANT: with this enabled, fault-remediation's COMPONENT_RESET must
  # map to a custom resource acting only on that GPU_UUID — if it mapped to a
  # reboot instead, pods that were never drained would be restarted by the
  # reboot.
  partialDrainEnabled: false
################################################################################
# FAULT-REMEDIATION MODULE CONFIGURATION
#
# Creates maintenance CRDs (Custom Resource Definitions) to trigger node recovery.
# The janitor module watches these CRDs and performs actual cloud provider API
# calls to reboot or terminate nodes.
################################################################################
fault-remediation:
# Number of pod replicas (recommend 1 for consistency)
replicaCount: 1
# Log level for debugging and monitoring
logLevel: info
# Container image configuration
image:
repository: ghcr.io/nvidia/nvsentinel/fault-remediation
pullPolicy: IfNotPresent
tag: ""
# Pod resource limits and requests
resources:
limits:
cpu: "200m"
memory: "300Mi"
requests:
cpu: "200m"
memory: "300Mi"
# Scheduling configuration
nodeSelector: {}
affinity: {}
# Special tolerations - allow running on all nodes for log collection
tolerations:
- operator: "Exists"
podAnnotations: {}
# Multi-template remediation configuration
# Allows different remediation actions to use different CRDs and operators
maintenance:
# Per-action remediation definitions
# Key is the RecommendedAction string (e.g., "COMPONENT_RESET", "RESTART_BM")
actions:
COMPONENT_RESET:
apiGroup: "janitor.dgxc.nvidia.com"
version: "v1alpha1"
kind: "RebootNode"
scope: "Cluster"
completeConditionType: "NodeReady"
templateFileName: "nvidia-reboot.yaml"
equivalenceGroup: "restart"
# RESTART_VM is used as a fallback when GPU UUID is not available from metadata-collector.
RESTART_VM:
apiGroup: "janitor.dgxc.nvidia.com"
version: "v1alpha1"
kind: "RebootNode"
scope: "Cluster"
completeConditionType: "NodeReady"
templateFileName: "nvidia-reboot.yaml"
equivalenceGroup: "restart"
# RESTART_BM is used for bare-metal node restarts.
RESTART_BM:
apiGroup: "janitor.dgxc.nvidia.com"
version: "v1alpha1"
kind: "RebootNode"
scope: "Cluster"
completeConditionType: "NodeReady"
templateFileName: "nvidia-reboot.yaml"
equivalenceGroup: "restart"
# REPLACE_VM is used when the VM needs to be terminated and replaced.
REPLACE_VM:
apiGroup: "janitor.dgxc.nvidia.com"
version: "v1alpha1"
kind: "TerminateNode"
scope: "Cluster"
completeConditionType: "NodeTerminated"
templateFileName: "terminate-node.yaml"
equivalenceGroup: "terminate"
# NOTE: Resource names for RBAC are generated by appending 's' to lowercase kind.
# This works for regular nouns but may fail for irregular plurals:
# RebootNode → rebootnodes (✓ correct)
# Policy → policys (✗ should be policies)
# Use CRD kinds that follow regular pluralization rules.
# Template content for each remediation action
# Key matches the templateFileName name from actions above
templates:
"nvidia-reboot.yaml": |
apiVersion: janitor.dgxc.nvidia.com/v1alpha1
kind: RebootNode
metadata:
name: maintenance-{{ .HealthEvent.NodeName }}-{{ .HealthEventID }}
labels:
app.kubernetes.io/managed-by: nvsentinel
annotations:
nvsentinel.nvidia.com/ttl: "336h"
spec:
nodeName: {{ .HealthEvent.NodeName }}
force: false
"terminate-node.yaml": |
apiVersion: janitor.dgxc.nvidia.com/v1alpha1
kind: TerminateNode
metadata:
name: maintenance-{{ .HealthEvent.NodeName }}-{{ .HealthEventID }}
labels:
app.kubernetes.io/managed-by: nvsentinel
annotations:
nvsentinel.nvidia.com/ttl: "336h"
spec:
nodeName: {{ .HealthEvent.NodeName }}
force: false
# Additional template examples:
# "namespaced-restart.yaml": |
# apiVersion: remediation.example.com/v1alpha1
# kind: RestartNode
# metadata:
# name: maintenance-{{ .NodeName }}-{{ .HealthEventID }}
# namespace: remediation
# labels:
# app.kubernetes.io/managed-by: nvsentinel
# spec:
# nodeName: {{ .NodeName }}
# ttlSecondsAfterFinished: 60
# Retry configuration for node annotation updates
# After creating maintenance resource, module updates node annotations
# These settings control retry behavior on failures (conflicts, network issues)
updateRetry:
# Maximum retry attempts
# After this many failures, the update is abandoned
maxRetries: 5
# Delay between retries in seconds
# Uses exponential backoff (delay increases each retry)
# First retry: retryDelaySeconds
# Second retry: retryDelaySeconds * 2
# Third retry: retryDelaySeconds * 4, etc.
retryDelaySeconds: 10
# Log collector configuration
# When enabled, creates Kubernetes Jobs to collect diagnostic logs
# from failing nodes before they are rebooted or terminated
logCollector:
# Enable log collection feature
# When true, a log collector job is created for each node failure
# Logs are uploaded to the configured upload URL
enabled: false
# Container image for log collector
image:
repository: ghcr.io/nvidia/nvsentinel/log-collector
pullPolicy: IfNotPresent
# HTTP endpoint where logs will be uploaded
# Should be the incluster-file-server or external storage
# Logs are uploaded as tar.gz files
uploadURL: "http://nvsentinel-incluster-file-server.nvsentinel.svc.cluster.local/upload"
# Namespaces where GPU operator components run
# Log collector will gather logs from these namespaces
# Can be multiple namespaces separated by commas
gpuOperatorNamespaces: "gpu-operator"
# Enable GPU Operator must-gather collection (disabled by default)
# WARNING: must-gather collects logs from ALL nodes in the cluster, which can be very
# time-consuming for large clusters
# If enabling this, you MUST increase the timeout accordingly.
# Recommended timeout: ~2-3 minutes per node
enableGpuOperatorMustGather: false
# Timeout for log collection job (default: 10m)
# If enableGpuOperatorMustGather is true, increase to: ~2-3 min per node in cluster
timeout: "10m"
# Enable GCP-specific SOS report collection
# Requires running on GCP and appropriate permissions
enableGcpSosCollection: false
# Enable AWS-specific SOS report collection
# Requires running on AWS and appropriate permissions
enableAwsSosCollection: false
################################################################################
# GPU-HEALTH-MONITOR CONFIGURATION
#
# Monitors GPU hardware health using NVIDIA DCGM (Data Center GPU Manager).
# Detects: XID errors, thermal throttling, ECC errors, memory errors, NVLink
# failures, and other GPU hardware faults.
#
# Requires: NVIDIA GPU Operator with DCGM deployed in the cluster
################################################################################
gpu-health-monitor:
# Container image configuration
image:
repository: ghcr.io/nvidia/nvsentinel/gpu-health-monitor
pullPolicy: IfNotPresent
tag: ""
# DCGM integration configuration
# DCGM must be running in the cluster (typically via GPU Operator)
dcgm:
# Enable DCGM Kubernetes service discovery
# When true, monitor connects to DCGM via Kubernetes service
# When false, must use host networking to connect to local DCGM
dcgmK8sServiceEnabled: true
# DCGM service endpoint
service:
# Service endpoint in format: servicename.namespace.svc
# Default GPU Operator deployment uses: nvidia-dcgm.gpu-operator.svc
endpoint: "nvidia-dcgm.gpu-operator.svc"
# DCGM service port
# Default DCGM port is 5555
port: 5555
# Use host networking for GPU health monitor pods
# Required when:
# - DCGM is running as hostProcess (not as service)
# - Need direct access to host-level GPU metrics
# - dcgmK8sServiceEnabled is false
# Most deployments should use service-based access (false)
useHostNetworking: false
# Additional volume mounts for GPU health monitor
# Use when monitor needs access to host files
# Example:
# additionalVolumeMounts:
# - name: custom-config
# mountPath: /etc/custom
# readOnly: true
additionalVolumeMounts: []
# Additional host volumes for GPU health monitor
# Use with additionalVolumeMounts to mount host paths
# Example:
# additionalHostVolumes:
# - name: custom-config
# hostPath:
# path: /etc/custom
# type: Directory
additionalHostVolumes: []
# Pod resource limits and requests
resources:
limits:
cpu: 500m
memory: 512Mi
requests:
cpu: 100m
memory: 128Mi
# Scheduling configuration
nodeSelector: {}
affinity: {}
tolerations: []
podAnnotations: {}
################################################################################
# LABELER CONFIGURATION
#
# Labels nodes based on their capabilities and features.
# Currently detects: Kata Containers support
# Runs as Deployment to label all nodes in the cluster.
################################################################################
labeler:
# Number of replicas for the labeler deployment
replicaCount: 1
# Log level for debugging
logLevel: info
# Container image configuration
image:
repository: ghcr.io/nvidia/nvsentinel/labeler
pullPolicy: IfNotPresent
tag: ""
podAnnotations: {}
# Kata Containers detection configuration
# Labeler checks node labels to determine Kata support
# Sets output label: nvsentinel.dgxc.nvidia.com/kata.enabled
#
# Detection logic:
# 1. Check default label: katacontainers.io/kata-runtime
# 2. If kataLabelOverride is set, also check that label
# 3. Label value must be truthy: "true", "enabled", "1", or "yes" (case-insensitive)
# 4. Set output label to "true" if detected, "false" otherwise
#
# Use kataLabelOverride when:
# - Your cluster uses custom labels for Kata detection
# - You have additional Kata runtime indicators
#
# Example: 'io.katacontainers.config.runtime.oci_runtime'
# Leave empty to only check default label
kataLabelOverride: ""
# Pod resource limits and requests
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 256Mi
################################################################################
# JANITOR CONFIGURATION
#
# Executes node reboots and terminations by calling cloud provider APIs.
# Watches maintenance CRDs created by fault-remediation and performs the
# actual infrastructure operations.
#
# Supports: AWS, GCP, Azure, OCI, kind (for testing), kwok (for testing)
################################################################################
janitor:
  # Number of pod replicas (1 recommended for consistency)
replicaCount: 1
# Container image configuration
image:
repository: ghcr.io/nvidia/nvsentinel/janitor
pullPolicy: IfNotPresent
tag: ""
# Kubernetes service account configuration
serviceAccount:
# Create service account
# Set to false if using an existing service account
create: true
# Automatically mount service account token
automount: true
# Annotations for service account
# Used for cloud provider identity (IRSA, Workload Identity, etc.)
# Examples:
# AWS IRSA:
# eks.amazonaws.com/role-arn: arn:aws:iam::ACCOUNT_ID:role/nvsentinel-janitor
# GCP Workload Identity:
# iam.gke.io/gcp-service-account: nvsentinel-janitor@PROJECT_ID.iam.gserviceaccount.com
# Azure Workload Identity:
# azure.workload.identity/client-id: "12345678-1234-1234-1234-123456789012"
annotations: {}
# Service account name
# Leave empty to use generated name from chart
name: ""
podAnnotations: {}
podLabels: {}
podSecurityContext: {}
securityContext: {}
# Pod resource limits
# Default is no limits - set based on cluster size
resources: {}
# limits:
# cpu: 100m
# memory: 128Mi
# requests:
# cpu: 100m
# memory: 128Mi
# Scheduling configuration
nodeSelector: {}
tolerations: []
affinity: {}
# Autoscaling configuration
# Not typically needed for janitor (low request rate)
autoscaling:
enabled: false
minReplicas: 1
maxReplicas: 100
targetCPUUtilizationPercentage: 80
# Janitor controller configuration
config:
# Global timeout for all operations
# Used as default for controllers that don't specify their own
# Should account for cloud provider API call time + node boot time