Skip to content

Commit 9cefc13

Browse files
authored
MTSRE-1521: Collect metrics for reconcile errors (#471)
* MTSRE-1521: Collect metrics for reconcile errors * updated with review requests
1 parent 61fd0e6 commit 9cefc13

18 files changed

Lines changed: 409 additions & 43 deletions

cmd/addon-operator-manager/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ func initReconcilers(mgr ctrl.Manager,
129129
aictrl.WithLog{Log: addonInstancePhaseLog.WithName("checkHeartbeat")},
130130
),
131131
},
132+
aictrl.WithRecorder{Recorder: recorder},
132133
)
133134

134135
if err := addonInstanceCtrl.SetupWithManager(mgr); err != nil {

internal/controllers/addon/addon_deletion_reconciler.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ import (
99
ctrl "sigs.k8s.io/controller-runtime"
1010

1111
addonsv1alpha1 "github.com/openshift/addon-operator/apis/addons/v1alpha1"
12+
"github.com/openshift/addon-operator/internal/controllers"
13+
"github.com/openshift/addon-operator/internal/metrics"
1214
)
1315

1416
const (
@@ -34,6 +36,7 @@ func (c defaultClock) Now() time.Time {
3436
type addonDeletionReconciler struct {
3537
clock clock
3638
handlers []addonDeletionHandler
39+
recorder *metrics.Recorder
3740
}
3841

3942
func (r *addonDeletionReconciler) Reconcile(ctx context.Context, addon *addonsv1alpha1.Addon) (ctrl.Result, error) {
@@ -52,13 +55,17 @@ func (r *addonDeletionReconciler) Reconcile(ctx context.Context, addon *addonsv1
5255
// We set ReadyToBeDeleted=false status condition in response to the delete signal received from OCM.
5356
reportAddonReadyToBeDeletedStatus(addon, metav1.ConditionFalse)
5457

58+
reconErr := metrics.NewReconcileError("addon", r.recorder, true)
59+
5560
for _, handler := range r.handlers {
5661
if err := handler.NotifyAddon(ctx, addon); err != nil {
62+
err = reconErr.Join(err, controllers.ErrNotifyAddon)
5763
return ctrl.Result{}, err
5864
}
5965
// If ack is received from the underlying addon, we report ReadyToBeDeleted = true.
6066
ackReceived, err := handler.AckReceivedFromAddon(ctx, addon)
6167
if err != nil {
68+
err = reconErr.Join(err, controllers.ErrAckReceivedFromAddon)
6269
return ctrl.Result{}, err
6370
}
6471
if ackReceived {

internal/controllers/addon/addon_instance_reconciler.go

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import (
55
"fmt"
66

77
"k8s.io/apimachinery/pkg/api/equality"
8-
"k8s.io/apimachinery/pkg/api/errors"
8+
apiErrors "k8s.io/apimachinery/pkg/api/errors"
99
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1010
"k8s.io/apimachinery/pkg/runtime"
1111
ctrl "sigs.k8s.io/controller-runtime"
@@ -15,20 +15,24 @@ import (
1515

1616
addonsv1alpha1 "github.com/openshift/addon-operator/apis/addons/v1alpha1"
1717
"github.com/openshift/addon-operator/internal/controllers"
18+
"github.com/openshift/addon-operator/internal/metrics"
1819
)
1920

2021
const ADDON_INSTANCE_RECONCILER_NAME = "addonInstanceReconciler"
2122

2223
type addonInstanceReconciler struct {
23-
client client.Client
24-
scheme *runtime.Scheme
24+
client client.Client
25+
scheme *runtime.Scheme
26+
recorder *metrics.Recorder
2527
}
2628

2729
func (r *addonInstanceReconciler) Reconcile(ctx context.Context,
2830
addon *addonsv1alpha1.Addon) (reconcile.Result, error) {
31+
reconErr := metrics.NewReconcileError("addon", r.recorder, true)
2932
// Ensure the creation of the corresponding AddonInstance in .spec.install.olmOwnNamespace/.spec.install.olmAllNamespaces namespace
3033
if err := r.ensureAddonInstance(ctx, addon); err != nil {
31-
return ctrl.Result{}, fmt.Errorf("failed to ensure the creation of addoninstance: %w", err)
34+
err = reconErr.Join(err, controllers.ErrEnsureCreateAddonInstance)
35+
return ctrl.Result{}, err
3236
}
3337
return reconcile.Result{}, nil
3438
}
@@ -73,7 +77,7 @@ func (r *addonInstanceReconciler) reconcileAddonInstance(
7377
ctx context.Context, desiredAddonInstance *addonsv1alpha1.AddonInstance) error {
7478
currentAddonInstance := &addonsv1alpha1.AddonInstance{}
7579
err := r.client.Get(ctx, client.ObjectKeyFromObject(desiredAddonInstance), currentAddonInstance)
76-
if errors.IsNotFound(err) {
80+
if apiErrors.IsNotFound(err) {
7781
return r.client.Create(ctx, desiredAddonInstance)
7882
}
7983
if err != nil {

internal/controllers/addon/addon_reconciler_options.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ func (w WithPackageOperatorReconciler) ApplyToAddonReconciler(config *AddonRecon
4141
Scheme: w.Scheme,
4242
ClusterID: config.ClusterExternalID,
4343
OcmClusterInfo: config.GetOCMClusterInfo,
44+
recorder: config.Recorder,
4445
}
4546
config.subReconcilers = append(config.subReconcilers, poReconciler)
4647
}

internal/controllers/addon/controller.go

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,35 +97,41 @@ func NewAddonReconciler(
9797
&legacyDeletionHandler{client: client, uncachedClient: uncachedClient},
9898
&addonInstanceDeletionHandler{client: client},
9999
},
100+
recorder: recorder,
100101
},
101102
// Step 2: Reconcile Namespace
102103
&namespaceReconciler{
103-
client: client,
104-
scheme: scheme,
104+
client: client,
105+
scheme: scheme,
106+
recorder: recorder,
105107
},
106108
// Step 3: Reconcile Addon pull secrets
107109
&addonSecretPropagationReconciler{
108110
cachedClient: client,
109111
uncachedClient: uncachedClient,
110112
scheme: scheme,
111113
addonOperatorNamespace: addonOperatorNamespace,
114+
recorder: recorder,
112115
},
113116
// Step 4: Reconcile AddonInstance object
114117
&addonInstanceReconciler{
115-
client: client,
116-
scheme: scheme,
118+
client: client,
119+
scheme: scheme,
120+
recorder: recorder,
117121
},
118122
// Step 5: Reconcile OLM objects
119123
&olmReconciler{
120124
client: client,
121125
uncachedClient: uncachedClient,
122126
scheme: scheme,
123127
operatorResourceHandler: operatorResourceHandler,
128+
recorder: recorder,
124129
},
125130
// Step 6: Reconcile Monitoring Federation
126131
&monitoringFederationReconciler{
127-
client: client,
128-
scheme: scheme,
132+
client: client,
133+
scheme: scheme,
134+
recorder: recorder,
129135
},
130136
},
131137
}
@@ -273,9 +279,11 @@ func (r *AddonReconciler) Reconcile(
273279
) (ctrl.Result, error) {
274280
logger := r.Log.WithValues("addon", req.NamespacedName.String())
275281
ctx = controllers.ContextWithLogger(ctx, logger)
282+
reconErr := metrics.NewReconcileError("addon", r.Recorder, false)
276283

277284
addon := &addonsv1alpha1.Addon{}
278285
if err := r.Get(ctx, req.NamespacedName, addon); err != nil {
286+
reconErr.Report(controllers.ErrGetAddon, addon.Name)
279287
return ctrl.Result{}, client.IgnoreNotFound(err)
280288
}
281289

@@ -287,6 +295,10 @@ func (r *AddonReconciler) Reconcile(
287295
}
288296
errors := r.syncWithExternalAPIs(ctx, logger, addon)
289297

298+
if errors.ErrorOrNil() != nil {
299+
reconErr.Report(controllers.ErrSyncWithExternalAPIs, addon.Name)
300+
}
301+
290302
// append reconcilerErr
291303
errors = multierror.Append(errors, reconcileErr)
292304

@@ -322,6 +334,8 @@ func (r *AddonReconciler) reconcile(ctx context.Context, addon *addonsv1alpha1.A
322334
log logr.Logger,
323335
) (ctrl.Result, error) {
324336
ctx = controllers.ContextWithLogger(ctx, log)
337+
reconErr := metrics.NewReconcileError("addon", r.Recorder, false)
338+
subReconErr := metrics.NewReconcileError("addon", r.Recorder, true)
325339
// Handle addon deletion before checking for pause condition.
326340
// This allows even paused addons to be deleted.
327341
if !addon.DeletionTimestamp.IsZero() {
@@ -362,13 +376,15 @@ func (r *AddonReconciler) reconcile(ctx context.Context, addon *addonsv1alpha1.A
362376
if !controllerutil.ContainsFinalizer(addon, cacheFinalizer) {
363377
controllerutil.AddFinalizer(addon, cacheFinalizer)
364378
if err := r.Update(ctx, addon); err != nil {
379+
reconErr.Report(controllers.ErrUpdateAddon, addon.Name)
365380
return ctrl.Result{}, fmt.Errorf("failed to add finalizer: %w", err)
366381
}
367382
}
368383

369384
// Run each sub reconciler serially
370385
for _, reconciler := range r.subReconcilers {
371386
if result, err := reconciler.Reconcile(ctx, addon); err != nil {
387+
subReconErr.Report(err, addon.Name)
372388
return ctrl.Result{}, fmt.Errorf("%s : failed to reconcile : %w", reconciler.Name(), err)
373389
} else if !result.IsZero() {
374390
return result, nil

internal/controllers/addon/controller_test.go

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,11 @@ import (
1414
ctrl "sigs.k8s.io/controller-runtime"
1515
"sigs.k8s.io/controller-runtime/pkg/reconcile"
1616

17+
promTestUtil "github.com/prometheus/client_golang/prometheus/testutil"
18+
1719
addonsv1alpha1 "github.com/openshift/addon-operator/apis/addons/v1alpha1"
20+
"github.com/openshift/addon-operator/internal/controllers"
21+
"github.com/openshift/addon-operator/internal/metrics"
1822
"github.com/openshift/addon-operator/internal/ocm"
1923
"github.com/openshift/addon-operator/internal/ocm/ocmtest"
2024
"github.com/openshift/addon-operator/internal/testutil"
@@ -26,7 +30,10 @@ type reconcileErrorTestCase struct {
2630
statusUpdateErrPresent bool
2731
}
2832

29-
var _ addonReconciler = (*mockSubReconciler)(nil)
33+
var (
34+
_ addonReconciler = (*mockSubReconciler)(nil)
35+
errMockSubReconcile = errors.New("failed to reconcile")
36+
)
3037

3138
type mockSubReconciler struct {
3239
returnErr bool
@@ -38,7 +45,7 @@ func (m *mockSubReconciler) Name() string {
3845

3946
func (m *mockSubReconciler) Reconcile(ctx context.Context, addon *addonsv1alpha1.Addon) (ctrl.Result, error) {
4047
if m.returnErr {
41-
return ctrl.Result{}, errors.New("failed to reconcile")
48+
return ctrl.Result{}, errMockSubReconcile
4249
}
4350
return ctrl.Result{}, nil
4451
}
@@ -86,13 +93,17 @@ func TestReconcileErrorHandling(t *testing.T) {
8693
statusUpdateErrPresent: true,
8794
},
8895
}
89-
for _, testCase := range testCases {
96+
97+
for idx, testCase := range testCases {
9098
client := testutil.NewClient()
9199
ocmClient := ocmtest.NewClient()
100+
recorder := metrics.NewRecorder(true, fmt.Sprintf("clusterID-%v", idx))
101+
92102
r := AddonReconciler{
93103
Client: client,
94104
ocmClient: ocmClient,
95105
Log: logr.Discard(),
106+
Recorder: recorder,
96107
subReconcilers: []addonReconciler{},
97108
}
98109

@@ -136,6 +147,32 @@ func TestReconcileErrorHandling(t *testing.T) {
136147
assert.True(t, ok, "expected multi error")
137148
assert.Equal(t, expectedNumErrors(testCase), multiErr.Len())
138149
}
150+
151+
metric := recorder.GetReconcileErrorMetric()
152+
assert.NotNil(t, metric)
153+
154+
if testCase.externalAPISyncErrPresent {
155+
// Ensure sync error during reconcile was collected as a metric
156+
controllerMetricVal := promTestUtil.ToFloat64(
157+
metric.WithLabelValues(
158+
"addon",
159+
controllers.ErrSyncWithExternalAPIs.Error(),
160+
addon.Name,
161+
),
162+
)
163+
assert.True(t, controllerMetricVal > 0)
164+
}
165+
if testCase.reconcilerErrPresent {
166+
// Ensure reconcile error was collected as a metric
167+
controllerMetricVal := promTestUtil.ToFloat64(
168+
metric.WithLabelValues(
169+
"addon",
170+
errMockSubReconcile.Error(),
171+
addon.Name,
172+
),
173+
)
174+
assert.True(t, controllerMetricVal > 0)
175+
}
139176
}
140177
}
141178

internal/controllers/addon/monitoring_federation_reconciler.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,21 @@ import (
1919

2020
addonsv1alpha1 "github.com/openshift/addon-operator/apis/addons/v1alpha1"
2121
"github.com/openshift/addon-operator/internal/controllers"
22+
"github.com/openshift/addon-operator/internal/metrics"
2223
)
2324

2425
const MONITORING_FEDERATION_RECONCILER_NAME = "monitoringFederationReconciler"
2526

2627
type monitoringFederationReconciler struct {
27-
client client.Client
28-
scheme *runtime.Scheme
28+
client client.Client
29+
scheme *runtime.Scheme
30+
recorder *metrics.Recorder
2931
}
3032

3133
func (r *monitoringFederationReconciler) Reconcile(ctx context.Context,
3234
addon *addonsv1alpha1.Addon) (ctrl.Result, error) {
3335
log := controllers.LoggerFromContext(ctx)
36+
reconErr := metrics.NewReconcileError("addon", r.recorder, true)
3437

3538
// Possibly ensure monitoring federation
3639
// Normally this would be configured before the addon workload is installed
@@ -44,14 +47,19 @@ func (r *monitoringFederationReconciler) Reconcile(ctx context.Context,
4447

4548
return ctrl.Result{}, nil
4649
} else if err != nil {
47-
return ctrl.Result{}, fmt.Errorf("failed to ensure ServiceMonitor: %w", err)
50+
err = reconErr.Join(err, controllers.ErrEnsureCreateServiceMonitor)
51+
return ctrl.Result{}, err
4852
} else if !result.IsZero() {
4953
return result, nil
5054
}
5155

5256
// Remove possibly unwanted monitoring federation
5357
if err := r.ensureDeletionOfUnwantedMonitoringFederation(ctx, addon); err != nil {
54-
return ctrl.Result{}, fmt.Errorf("failed to ensure deletion of unwanted ServiceMonitors: %w", err)
58+
err = reconErr.Join(
59+
err,
60+
controllers.ErrEnsureDeleteServiceMonitor,
61+
)
62+
return ctrl.Result{}, err
5563
}
5664
return reconcile.Result{}, nil
5765
}

internal/controllers/addon/monitoring_stack_reconciler.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,17 @@ import (
2020

2121
addonsv1alpha1 "github.com/openshift/addon-operator/apis/addons/v1alpha1"
2222
"github.com/openshift/addon-operator/internal/controllers"
23+
"github.com/openshift/addon-operator/internal/metrics"
2324
)
2425

2526
const MONITORING_STACK_RECONCILER_NAME = "monitoringStackReconciler"
2627

2728
var errMonitoringStackSpecNotFound = fmt.Errorf("monitoring stack spec not found")
2829

2930
type monitoringStackReconciler struct {
30-
client client.Client
31-
scheme *runtime.Scheme
31+
client client.Client
32+
scheme *runtime.Scheme
33+
recorder *metrics.Recorder
3234
}
3335

3436
func (r *monitoringStackReconciler) Name() string {
@@ -37,13 +39,15 @@ func (r *monitoringStackReconciler) Name() string {
3739

3840
func (r *monitoringStackReconciler) Reconcile(ctx context.Context,
3941
addon *addonsv1alpha1.Addon) (ctrl.Result, error) {
42+
reconErr := metrics.NewReconcileError("addon", r.recorder, true)
4043

4144
// ensure creation of MonitoringStack object
4245
latestMonitoringStack, err := r.ensureMonitoringStack(ctx, addon)
4346
if err != nil {
4447
if errors.Is(err, errMonitoringStackSpecNotFound) {
4548
return reconcile.Result{}, nil
4649
}
50+
err = reconErr.Join(err, controllers.ErrEnsureCreateMonitoringStack)
4751
return reconcile.Result{}, err
4852
}
4953

internal/controllers/addon/namespace_reconciler.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,15 @@ import (
1616

1717
addonsv1alpha1 "github.com/openshift/addon-operator/apis/addons/v1alpha1"
1818
"github.com/openshift/addon-operator/internal/controllers"
19+
"github.com/openshift/addon-operator/internal/metrics"
1920
)
2021

2122
const NAMESPACE_RECONCILER_NAME = "namespaceReconciler"
2223

2324
type namespaceReconciler struct {
24-
client client.Client
25-
scheme *runtime.Scheme
25+
client client.Client
26+
scheme *runtime.Scheme
27+
recorder *metrics.Recorder
2628
}
2729

2830
func (r *namespaceReconciler) Reconcile(ctx context.Context,

0 commit comments

Comments
 (0)