Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -1017,7 +1017,8 @@ type DCGMExporterServiceConfig struct {
// DCGMExporterServiceMonitorConfig defines configuration options for the ServiceMonitor
// deployed for DCGM Exporter
type DCGMExporterServiceMonitorConfig struct {
// Enabled indicates if ServiceMonitor is deployed for NVIDIA DCGM Exporter
// Enabled indicates if ServiceMonitor is deployed for NVIDIA DCGM Exporter.
// Defaults to true. Set to false to explicitly disable ServiceMonitor creation.
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable deployment of NVIDIA DCGM Exporter ServiceMonitor"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
Expand Down Expand Up @@ -2256,11 +2257,11 @@ func (dcgm *DCGMSpec) IsEnabled() bool {
return *dcgm.Enabled
}

// IsEnabled reports whether the ServiceMonitor for DCGM Exporter should be
// created. It returns true unless Enabled is explicitly set to false: a nil
// receiver and an unset Enabled field both mean "use the default", which is
// enabled.
func (sm *DCGMExporterServiceMonitorConfig) IsEnabled() bool {
	// A nil receiver is treated the same as an unset field, so callers get the
	// effective default without having to nil-check the config first.
	if sm == nil || sm.Enabled == nil {
		return true
	}
	return *sm.Enabled
}
Expand Down
46 changes: 25 additions & 21 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -4833,8 +4833,9 @@ func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) {

if n.stateNames[state] == "state-dcgm-exporter" {
serviceMonitor := n.singleton.Spec.DCGMExporter.ServiceMonitor
// Check if ServiceMonitor is disabled and cleanup resource if exists
if serviceMonitor == nil || !serviceMonitor.IsEnabled() {

// Check if ServiceMonitor is explicitly disabled and cleanup resource if it exists
if serviceMonitor != nil && !serviceMonitor.IsEnabled() {
if !serviceMonitorCRDExists {
return gpuv1.Ready, nil
}
Expand All @@ -4846,33 +4847,36 @@ func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) {
return gpuv1.Disabled, nil
}

// If Prometheus CRD is missing, skip gracefully
if !serviceMonitorCRDExists {
logger.Error(fmt.Errorf("couldn't find ServiceMonitor CRD"), "Install Prometheus and necessary CRDs for gathering GPU metrics!")
return gpuv1.NotReady, nil
logger.V(1).Info("ServiceMonitor CRD not found, skipping DCGM Exporter ServiceMonitor creation")
return gpuv1.Ready, nil
}

// Apply custom edits for DCGM Exporter
if serviceMonitor.Interval != "" {
obj.Spec.Endpoints[0].Interval = serviceMonitor.Interval
}
// Apply custom edits for DCGM Exporter when a serviceMonitor config is provided
if serviceMonitor != nil {
if serviceMonitor.Interval != "" {
obj.Spec.Endpoints[0].Interval = serviceMonitor.Interval
}

if serviceMonitor.HonorLabels != nil {
obj.Spec.Endpoints[0].HonorLabels = *serviceMonitor.HonorLabels
}
if serviceMonitor.HonorLabels != nil {
obj.Spec.Endpoints[0].HonorLabels = *serviceMonitor.HonorLabels
}

if serviceMonitor.AdditionalLabels != nil {
for key, value := range serviceMonitor.AdditionalLabels {
obj.Labels[key] = value
if serviceMonitor.AdditionalLabels != nil {
for key, value := range serviceMonitor.AdditionalLabels {
obj.Labels[key] = value
}
}
}
if serviceMonitor.Relabelings != nil {
relabelConfigs := make([]promv1.RelabelConfig, len(serviceMonitor.Relabelings))
for i, relabel := range serviceMonitor.Relabelings {
if relabel != nil {
relabelConfigs[i] = *relabel
if serviceMonitor.Relabelings != nil {
relabelConfigs := make([]promv1.RelabelConfig, len(serviceMonitor.Relabelings))
for i, relabel := range serviceMonitor.Relabelings {
if relabel != nil {
relabelConfigs[i] = *relabel
}
}
obj.Spec.Endpoints[0].RelabelConfigs = relabelConfigs
}
obj.Spec.Endpoints[0].RelabelConfigs = relabelConfigs
}
}
if n.stateNames[state] == "state-operator-metrics" || n.stateNames[state] == "state-node-status-exporter" {
Expand Down
38 changes: 36 additions & 2 deletions controllers/object_controls_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1320,7 +1320,7 @@ func TestServiceMonitor(t *testing.T) {
expectedServiceMonitor: nil,
},
{
description: "dcgm-exporter SM enabled, CRD missing -> NotReady",
description: "dcgm-exporter SM enabled, CRD missing -> Ready (skip gracefully)",
stateName: "state-dcgm-exporter",
k8sObjects: nil,
clusterPolicySpec: gpuv1.ClusterPolicySpec{
Expand All @@ -1329,9 +1329,43 @@ func TestServiceMonitor(t *testing.T) {
ServiceMonitor: &gpuv1.DCGMExporterServiceMonitorConfig{Enabled: ptr.To(true)},
},
},
expectedState: gpuv1.NotReady,
expectedState: gpuv1.Ready,
expectedServiceMonitor: nil,
},
{
description: "dcgm-exporter SM enabled (default nil), CRD missing -> Ready (skip gracefully)",
stateName: "state-dcgm-exporter",
k8sObjects: nil,
clusterPolicySpec: gpuv1.ClusterPolicySpec{
DCGMExporter: gpuv1.DCGMExporterSpec{
Enabled: ptr.To(true),
},
},
expectedState: gpuv1.Ready,
expectedServiceMonitor: nil,
},
{
description: "dcgm-exporter SM enabled (default nil), CRD present -> Ready (created)",
stateName: "state-dcgm-exporter",
k8sObjects: []client.Object{serviceMonitorCRD},
clusterPolicySpec: gpuv1.ClusterPolicySpec{
DCGMExporter: gpuv1.DCGMExporterSpec{
Enabled: ptr.To(true),
},
},
expectedState: gpuv1.Ready,
expectedServiceMonitor: &promv1.ServiceMonitor{
ObjectMeta: metav1.ObjectMeta{
Name: testServiceMonitor,
Namespace: testNamespace,
Labels: nil,
},
Spec: promv1.ServiceMonitorSpec{
NamespaceSelector: promv1.NamespaceSelector{MatchNames: []string{testNamespace}},
Endpoints: []promv1.Endpoint{{}},
},
},
},
{
description: "dcgm-exporter SM disabled, CRD present -> Disabled (delete if exists)",
stateName: "state-dcgm-exporter",
Expand Down
2 changes: 1 addition & 1 deletion deployments/gpu-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ dcgmExporter:
service:
internalTrafficPolicy: Cluster
serviceMonitor:
enabled: false
enabled: true
interval: 15s
honorLabels: false
additionalLabels: {}
Expand Down