diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index f79497e64..d42d1b048 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -1017,7 +1017,8 @@ type DCGMExporterServiceConfig struct { // DCGMExporterServiceMonitorConfig defines configuration options for the ServiceMonitor // deployed for DCGM Exporter type DCGMExporterServiceMonitorConfig struct { - // Enabled indicates if ServiceMonitor is deployed for NVIDIA DCGM Exporter + // Enabled indicates if ServiceMonitor is deployed for NVIDIA DCGM Exporter. + // Defaults to true. Set to false to explicitly disable ServiceMonitor creation. // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable deployment of NVIDIA DCGM Exporter ServiceMonitor" // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" @@ -2256,11 +2257,11 @@ func (dcgm *DCGMSpec) IsEnabled() bool { return *dcgm.Enabled } -// IsEnabled returns true if ServiceMonitor for DCGM Exporter is enabled through gpu-operator +// IsEnabled returns true if ServiceMonitor for DCGM Exporter should be created (default). func (sm *DCGMExporterServiceMonitorConfig) IsEnabled() bool { if sm.Enabled == nil { - // ServiceMonitor for DCGM Exporter is disabled by default - return false + // default is true if not specified by user + return true } return *sm.Enabled } diff --git a/controllers/object_controls.go b/controllers/object_controls.go index b436bcab1..bf913049e 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -4833,8 +4833,9 @@ func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) { if n.stateNames[state] == "state-dcgm-exporter" { serviceMonitor := n.singleton.Spec.DCGMExporter.ServiceMonitor - // Check if ServiceMonitor is disabled and cleanup resource if exists - if serviceMonitor == nil || !serviceMonitor.IsEnabled() { + + // Check if ServiceMonitor is explicitly disabled and cleanup resource if it exists + if serviceMonitor != nil && !serviceMonitor.IsEnabled() { if !serviceMonitorCRDExists { return gpuv1.Ready, nil } @@ -4846,33 +4847,36 @@ func ServiceMonitor(n ClusterPolicyController) (gpuv1.State, error) { return gpuv1.Disabled, nil } + // If Prometheus CRD is missing, skip gracefully if !serviceMonitorCRDExists { - logger.Error(fmt.Errorf("couldn't find ServiceMonitor CRD"), "Install Prometheus and necessary CRDs for gathering GPU metrics!") - return gpuv1.NotReady, nil + logger.V(1).Info("ServiceMonitor CRD not found, skipping DCGM Exporter ServiceMonitor creation") + return gpuv1.Ready, nil } - // Apply custom edits for DCGM Exporter - if serviceMonitor.Interval != "" { - obj.Spec.Endpoints[0].Interval = serviceMonitor.Interval - } + // Apply custom edits for DCGM Exporter when a serviceMonitor config is provided + if serviceMonitor != nil { + if serviceMonitor.Interval != "" { + obj.Spec.Endpoints[0].Interval = serviceMonitor.Interval + } - if serviceMonitor.HonorLabels != nil { - obj.Spec.Endpoints[0].HonorLabels = *serviceMonitor.HonorLabels - } + if serviceMonitor.HonorLabels != nil { + obj.Spec.Endpoints[0].HonorLabels = *serviceMonitor.HonorLabels + } - if serviceMonitor.AdditionalLabels != nil { - for key, value := range serviceMonitor.AdditionalLabels { - obj.Labels[key] = value + if serviceMonitor.AdditionalLabels != nil { + for key, value := range serviceMonitor.AdditionalLabels { + obj.Labels[key] = value + } } - } - if serviceMonitor.Relabelings != nil { - relabelConfigs := make([]promv1.RelabelConfig, len(serviceMonitor.Relabelings)) - for i, relabel := range serviceMonitor.Relabelings { - if relabel != nil { - relabelConfigs[i] = *relabel + if serviceMonitor.Relabelings != nil { + relabelConfigs := make([]promv1.RelabelConfig, len(serviceMonitor.Relabelings)) + for i, relabel := range serviceMonitor.Relabelings { + if relabel != nil { + relabelConfigs[i] = *relabel + } } + obj.Spec.Endpoints[0].RelabelConfigs = relabelConfigs } - obj.Spec.Endpoints[0].RelabelConfigs = relabelConfigs } } if n.stateNames[state] == "state-operator-metrics" || n.stateNames[state] == "state-node-status-exporter" { diff --git a/controllers/object_controls_test.go b/controllers/object_controls_test.go index f6df7340d..0a98170b7 100644 --- a/controllers/object_controls_test.go +++ b/controllers/object_controls_test.go @@ -1320,7 +1320,7 @@ func TestServiceMonitor(t *testing.T) { expectedServiceMonitor: nil, }, { - description: "dcgm-exporter SM enabled, CRD missing -> NotReady", + description: "dcgm-exporter SM enabled, CRD missing -> Ready (skip gracefully)", stateName: "state-dcgm-exporter", k8sObjects: nil, clusterPolicySpec: gpuv1.ClusterPolicySpec{ @@ -1329,9 +1329,43 @@ func TestServiceMonitor(t *testing.T) { ServiceMonitor: &gpuv1.DCGMExporterServiceMonitorConfig{Enabled: ptr.To(true)}, }, }, - expectedState: gpuv1.NotReady, + expectedState: gpuv1.Ready, + expectedServiceMonitor: nil, + }, + { + description: "dcgm-exporter SM enabled (default nil), CRD missing -> Ready (skip gracefully)", + stateName: "state-dcgm-exporter", + k8sObjects: nil, + clusterPolicySpec: gpuv1.ClusterPolicySpec{ + DCGMExporter: gpuv1.DCGMExporterSpec{ + Enabled: ptr.To(true), + }, + }, + expectedState: gpuv1.Ready, expectedServiceMonitor: nil, }, + { + description: "dcgm-exporter SM enabled (default nil), CRD present -> Ready (created)", + stateName: "state-dcgm-exporter", + k8sObjects: []client.Object{serviceMonitorCRD}, + clusterPolicySpec: gpuv1.ClusterPolicySpec{ + DCGMExporter: gpuv1.DCGMExporterSpec{ + Enabled: ptr.To(true), + }, + }, + expectedState: gpuv1.Ready, + expectedServiceMonitor: &promv1.ServiceMonitor{ + ObjectMeta: metav1.ObjectMeta{ + Name: testServiceMonitor, + Namespace: testNamespace, + Labels: nil, + }, + Spec: promv1.ServiceMonitorSpec{ + NamespaceSelector: promv1.NamespaceSelector{MatchNames: []string{testNamespace}}, + Endpoints: []promv1.Endpoint{{}}, + }, + }, + }, { description: "dcgm-exporter SM disabled, CRD present -> Disabled (delete if exists)", stateName: "state-dcgm-exporter", diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 222b0e389..43ae88f78 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -290,7 +290,7 @@ dcgmExporter: service: internalTrafficPolicy: Cluster serviceMonitor: - enabled: false + enabled: true interval: 15s honorLabels: false additionalLabels: {}