Skip to content

Commit 0fd7099

Browse files
wadlejitendraarmruphisco
authored
feat: add LastBackupSucceeded Cluster condition
This patch introduces a new Cluster condition, LastBackupSucceeded, with the following semantic: - true if the last backup succeeded - false if the last backup failed - not set if no backups were run yet This allows waiting for at least a backup to be successful upon cluster startup using kubectl: ``` $ kubectl -n <namespace> --for=condition=LastBackupSucceeded=true cluster/<cluster-name> cluster.postgresql.k8s.enterprisedb.io/<cluster-name> condition met ``` It also allows to easily know whether a Cluster has a recent backup, by just inspecting the Cluster status. E2Es were also updated accordingly. Here is an example of conditions in case of success: ``` status: conditions: - reason: Continuous Archiving is Working status: "True" type: ContinuousArchiving - reason: Last backup succeeded status: "True" type: LastBackupSucceeded ``` And here in case of failure: ``` status: conditions: - reason: Continuous Archiving is Working status: "True" type: ContinuousArchiving - reason: Last backup failed status: "False" type: LastBackupSucceeded ``` Co-authored-by: Armando Ruocco <armando.ruocco@enterprisedb.com> Co-authored-by: Philippe Scorsolini <philippe.scorsolini@enterprisedb.com>
1 parent a30d49f commit 0fd7099

File tree

8 files changed

+109
-11
lines changed

8 files changed

+109
-11
lines changed

api/v1/cluster_types.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,8 @@ type ClusterStatus struct {
360360
const (
361361
// ConditionContinuousArchiving this condition archiving :owned by InstanceManager.
362362
ConditionContinuousArchiving ClusterConditionType = "ContinuousArchiving"
363+
// ConditionBackup this condition looking backup status :owned by InstanceManager.
364+
ConditionBackup ClusterConditionType = "LastBackupSucceeded"
363365
)
364366

365367
// ConditionStatus defines conditions of resources

controllers/backup_controller.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"sigs.k8s.io/controller-runtime/pkg/source"
2929

3030
apiv1 "github.com/EnterpriseDB/cloud-native-postgresql/api/v1"
31+
"github.com/EnterpriseDB/cloud-native-postgresql/internal/cmd/manager"
3132
"github.com/EnterpriseDB/cloud-native-postgresql/pkg/management/log"
3233
"github.com/EnterpriseDB/cloud-native-postgresql/pkg/management/postgres"
3334
"github.com/EnterpriseDB/cloud-native-postgresql/pkg/specs"
@@ -165,7 +166,7 @@ func (r *BackupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
165166
"pod", pod.Name)
166167

167168
// This backup has been started
168-
err = StartBackup(ctx, r.Client, &backup, pod)
169+
err = StartBackup(ctx, r.Client, &backup, pod, &cluster)
169170
if err != nil {
170171
r.Recorder.Eventf(&backup, "Warning", "Error", "Backup exit with error %v", err)
171172
}
@@ -180,13 +181,14 @@ func StartBackup(
180181
client client.Client,
181182
backup *apiv1.Backup,
182183
pod corev1.Pod,
184+
cluster *apiv1.Cluster,
183185
) error {
184186
// This backup has been started
185187
status := backup.GetStatus()
186188
status.Phase = apiv1.BackupPhaseStarted
187189
status.InstanceID = &apiv1.InstanceID{PodName: pod.Name, ContainerID: pod.Status.ContainerStatuses[0].ContainerID}
188190
if err := postgres.UpdateBackupStatusAndRetry(ctx, client, backup); err != nil {
189-
backup.GetStatus().SetAsFailed(fmt.Errorf("can't update backup: %w", err))
191+
status.SetAsFailed(fmt.Errorf("can't update backup: %w", err))
190192
return err
191193
}
192194
config := ctrl.GetConfigOrDie()
@@ -204,15 +206,21 @@ func StartBackup(
204206
nil,
205207
"/controller/manager",
206208
"backup",
207-
backup.GetName())
209+
backup.GetName(),
210+
)
208211
return err
209212
})
213+
210214
if err != nil {
211215
log.FromContext(ctx).Error(err, "executing backup", "stdout", stdout, "stderr", stderr)
212-
status := backup.GetStatus()
213216
status.SetAsFailed(fmt.Errorf("can't execute backup: %w", err))
214217
status.CommandError = stderr
215218
status.CommandError = stdout
219+
220+
// Update backup status in cluster conditions
221+
if errCond := manager.UpdateCondition(ctx, client, cluster, postgres.BuildBackupCondition(err)); errCond != nil {
222+
log.FromContext(ctx).Error(err, "Error status.UpdateCondition()")
223+
}
216224
return postgres.UpdateBackupStatusAndRetry(ctx, client, backup)
217225
}
218226

docs/src/troubleshooting.md

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,12 +400,19 @@ objects [like here](https://kubernetes.io/docs/concepts/workloads/pods/pod-lifec
400400
Cluster exposes `status.conditions` as well. This allows one to 'wait' for a particular
401401
event to occur instead of relying on the overall cluster health state. Available conditions as of now are:
402402

403+
- LastBackupSucceeded
403404
- ContinuousArchiving
404405

405406
### How to wait for a particular condition
406407

408+
- Backup:
407409
```bash
408-
$ kubectl wait --for=condition=ContinuousArchiving cluster/<cluster-name>
410+
$ kubectl wait --for=condition=LastBackupSucceeded cluster/<CLUSTER-NAME> -n <NAMESPACE>
411+
```
412+
413+
- ContinuousArchiving:
414+
```bash
415+
$ kubectl wait --for=condition=ContinuousArchiving cluster/<CLUSTER-NAME> -n <NAMESPACE>
409416
```
410417

411418
Below is a snippet of a `cluster.status` that contains a failing condition.
@@ -418,10 +425,15 @@ $ kubectl get cluster/<cluster-name> -o yaml
418425
status:
419426
conditions:
420427
- message: 'unexpected failure invoking barman-cloud-wal-archive: exit status
421-
4'
428+
2'
422429
reason: Continuous Archiving is Failing
423430
status: "False"
424431
type: ContinuousArchiving
432+
433+
- message: exit status 2
434+
reason: Backup is failed
435+
status: "False"
436+
type: LastBackupSucceeded
425437
```
426438

427439
## Some common issues

internal/cmd/manager/manager.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -126,15 +126,15 @@ func customDestination(in *zap.Options) {
126126

127127
// UpdateCondition will allow instance manager to update a particular condition
128128
// if the condition was already updated in the correct state then this is a no-op
129-
func UpdateCondition(ctx context.Context, client client.WithWatch,
129+
func UpdateCondition(ctx context.Context, c client.Client,
130130
cluster *apiv1.Cluster, condition *apiv1.ClusterCondition,
131131
) error {
132132
if cluster == nil && condition == nil {
133133
// if cluster or condition is nil nothing to do here.
134134
return nil
135135
}
136136

137-
existingClusterStatus := cluster.Status
137+
oriCluster := cluster.DeepCopy()
138138
var exCondition *apiv1.ClusterCondition
139139
for i, c := range cluster.Status.Conditions {
140140
if c.Type == condition.Type {
@@ -148,8 +148,9 @@ func UpdateCondition(ctx context.Context, client client.WithWatch,
148148
cluster.Status.Conditions = append(cluster.Status.Conditions, *condition)
149149
}
150150

151-
if !reflect.DeepEqual(existingClusterStatus, cluster.Status) {
152-
if err := client.Status().Update(ctx, cluster); err != nil {
151+
if !reflect.DeepEqual(oriCluster.Status, cluster.Status) {
152+
// To avoid conflict using patch instead of update
153+
if err := c.Status().Patch(ctx, cluster, client.MergeFrom(oriCluster)); err != nil {
153154
return err
154155
}
155156
}

pkg/management/postgres/backup.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"sigs.k8s.io/controller-runtime/pkg/client"
2727

2828
apiv1 "github.com/EnterpriseDB/cloud-native-postgresql/api/v1"
29+
"github.com/EnterpriseDB/cloud-native-postgresql/internal/cmd/manager"
2930
"github.com/EnterpriseDB/cloud-native-postgresql/pkg/fileutils"
3031
"github.com/EnterpriseDB/cloud-native-postgresql/pkg/management/barman"
3132
barmanCapabilities "github.com/EnterpriseDB/cloud-native-postgresql/pkg/management/barman/capabilities"
@@ -279,6 +280,13 @@ func (b *BackupCommand) run(ctx context.Context) {
279280
b.Log.Error(err, "Backup failed")
280281
backupStatus.SetAsFailed(err)
281282
b.Recorder.Event(b.Backup, "Normal", "Failed", "Backup failed")
283+
284+
// Update backup status in cluster conditions
285+
if errCond := manager.UpdateCondition(ctx, b.Client,
286+
b.Cluster, BuildBackupCondition(err)); errCond != nil {
287+
b.Log.Error(errCond, "Error status.UpdateCondition()")
288+
}
289+
282290
if err := UpdateBackupStatusAndRetry(ctx, b.Client, b.Backup); err != nil {
283291
b.Log.Error(err, "Can't mark backup as failed")
284292
}
@@ -290,6 +298,12 @@ func (b *BackupCommand) run(ctx context.Context) {
290298
backupStatus.SetAsCompleted()
291299
b.Recorder.Event(b.Backup, "Normal", "Completed", "Backup completed")
292300

301+
// Update backup status in cluster conditions
302+
if errCond := manager.UpdateCondition(ctx, b.Client,
303+
b.Cluster, BuildBackupCondition(nil)); errCond != nil {
304+
b.Log.Error(errCond, "Error status.UpdateCondition()")
305+
}
306+
293307
// Delete backups per policy
294308
if b.Cluster.Spec.Backup.RetentionPolicy != "" {
295309
b.Log.Info("Applying backup retention policy",
@@ -413,3 +427,21 @@ func (b *BackupCommand) updateCompletedBackupStatus(backupList *catalog.Catalog)
413427
backupStatus.BeginLSN = latestBackup.BeginLSN
414428
backupStatus.EndLSN = latestBackup.EndLSN
415429
}
430+
431+
// BuildBackupCondition build a backup conditions
432+
func BuildBackupCondition(err error) *apiv1.ClusterCondition {
433+
if err != nil {
434+
return &apiv1.ClusterCondition{
435+
Type: apiv1.ConditionBackup,
436+
Status: apiv1.ConditionFalse,
437+
Reason: "Last backup failed",
438+
Message: err.Error(),
439+
}
440+
}
441+
return &apiv1.ClusterCondition{
442+
Type: apiv1.ConditionBackup,
443+
Status: apiv1.ConditionTrue,
444+
Reason: "Last backup succeeded",
445+
Message: "",
446+
}
447+
}

tests/e2e/asserts_test.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1417,6 +1417,7 @@ func prepareClusterForPITROnMinio(
14171417
})
14181418
AssertArchiveWalOnMinio(namespace, clusterName)
14191419
AssertArchiveConditionMet(namespace, clusterName, "5m")
1420+
AssertBackupConditionInClusterStatus(namespace, clusterName)
14201421
}
14211422

14221423
func prepareClusterForPITROnAzureBlob(namespace, clusterName, backupSampleFile,
@@ -1451,6 +1452,7 @@ func prepareClusterForPITROnAzureBlob(namespace, clusterName, backupSampleFile,
14511452
})
14521453
AssertArchiveWalOnAzureBlob(namespace, clusterName, azStorageAccount, azStorageKey)
14531454
AssertArchiveConditionMet(namespace, clusterName, "5m")
1455+
AssertBackupConditionInClusterStatus(namespace, clusterName)
14541456
}
14551457

14561458
func prepareClusterOnAzurite(namespace, clusterName, clusterSampleFile string) {
@@ -1497,6 +1499,7 @@ func prepareClusterBackupOnAzurite(namespace, clusterName, clusterSampleFile, ba
14971499
return cluster.Status.FirstRecoverabilityPoint, err
14981500
}, 30).ShouldNot(BeEmpty())
14991501
})
1502+
AssertBackupConditionInClusterStatus(namespace, clusterName)
15001503
}
15011504

15021505
func prepareClusterForPITROnAzurite(namespace, clusterName, backupSampleFile string, currentTimestamp *string) {
@@ -2014,3 +2017,16 @@ func CreateResourceFromFile(namespace, sampleFilePath string) {
20142017
return nil
20152018
}, 60, 5).Should(BeNil())
20162019
}
2020+
2021+
func AssertBackupConditionInClusterStatus(namespace, clusterName string) {
2022+
By(fmt.Sprintf("waiting for backup condition status in cluster '%v'", clusterName), func() {
2023+
Eventually(func() (string, error) {
2024+
getBackupCondition, err := testsUtils.GetConditionsInClusterStatus(
2025+
namespace, clusterName, env, apiv1.ConditionBackup)
2026+
if err != nil {
2027+
return "", err
2028+
}
2029+
return string(getBackupCondition.Status), nil
2030+
}, 300, 5).Should(BeEquivalentTo("True"))
2031+
})
2032+
}

tests/e2e/backup_restore_test.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ var _ = Describe("Backup and restore", Label(tests.LabelBackupRestore), func() {
157157
// There should be a backup resource and
158158
By("backing up a cluster and verifying it exists on minio", func() {
159159
testUtils.ExecuteBackup(namespace, backupFile, env)
160-
160+
AssertBackupConditionInClusterStatus(namespace, clusterName)
161161
Eventually(func() (int, error) {
162162
return testUtils.CountFilesOnMinio(namespace, minioClientName, "data.tar")
163163
}, 30).Should(BeEquivalentTo(1))
@@ -313,6 +313,7 @@ var _ = Describe("Backup and restore", Label(tests.LabelBackupRestore), func() {
313313
By("uploading a backup", func() {
314314
// We create a backup
315315
testUtils.ExecuteBackup(namespace, backupFile, env)
316+
AssertBackupConditionInClusterStatus(namespace, clusterName)
316317

317318
// Verifying file called data.tar should be available on Azure blob storage
318319
Eventually(func() (int, error) {
@@ -602,6 +603,7 @@ var _ = Describe("Clusters Recovery From Barman Object Store", Label(tests.Label
602603
// There should be a backup resource and
603604
By("backing up a cluster and verifying it exists on minio", func() {
604605
testUtils.ExecuteBackup(namespace, sourceBackupFileMinio, env)
606+
AssertBackupConditionInClusterStatus(namespace, clusterName)
605607
Eventually(func() (int, error) {
606608
return testUtils.CountFilesOnMinio(namespace, minioClientName, "data.tar")
607609
}, 30).Should(BeEquivalentTo(1))
@@ -645,6 +647,7 @@ var _ = Describe("Clusters Recovery From Barman Object Store", Label(tests.Label
645647
// There should be a backup resource and
646648
By("backing up a cluster and verifying it exists on minio", func() {
647649
testUtils.ExecuteBackup(namespace, sourceBackupFileMinio, env)
650+
AssertBackupConditionInClusterStatus(namespace, clusterName)
648651
Eventually(func() (int, error) {
649652
return testUtils.CountFilesOnMinio(namespace, minioClientName, "data.tar")
650653
}, 30).Should(BeEquivalentTo(1))
@@ -696,6 +699,7 @@ var _ = Describe("Clusters Recovery From Barman Object Store", Label(tests.Label
696699
By("backing up a cluster and verifying it exists on azure blob storage", func() {
697700
// Create the backup
698701
testUtils.ExecuteBackup(namespace, sourceBackupFileAzure, env)
702+
AssertBackupConditionInClusterStatus(namespace, clusterName)
699703
// Verifying file called data.tar should be available on Azure blob storage
700704
Eventually(func() (int, error) {
701705
return testUtils.CountFilesOnAzureBlobStorage(azStorageAccount, azStorageKey, clusterName, "data.tar")
@@ -767,6 +771,7 @@ var _ = Describe("Clusters Recovery From Barman Object Store", Label(tests.Label
767771
By("backing up a cluster and verifying it exists on azure blob storage", func() {
768772
// We create a Backup
769773
testUtils.ExecuteBackup(namespace, sourceBackupFileAzureSAS, env)
774+
AssertBackupConditionInClusterStatus(namespace, clusterName)
770775
// Verifying file called data.tar should be available on Azure blob storage
771776
Eventually(func() (int, error) {
772777
return testUtils.CountFilesOnAzureBlobStorage(azStorageAccount, azStorageKey, clusterName, "data.tar")

tests/utils/backup.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,3 +385,25 @@ func CountFilesOnAzuriteBlobStorage(
385385
err = json.Unmarshal([]byte(out), &arr)
386386
return len(arr), err
387387
}
388+
389+
// GetConditionsInClusterStatus get conditions values as given type from cluster object status
390+
func GetConditionsInClusterStatus(
391+
namespace,
392+
clusterName string,
393+
env *TestingEnvironment,
394+
conditionType apiv1.ClusterConditionType) (*apiv1.ClusterCondition, error) {
395+
var cluster *apiv1.Cluster
396+
var err error
397+
var backupConditionInCluster *apiv1.ClusterCondition
398+
cluster, err = env.GetCluster(namespace, clusterName)
399+
if err != nil {
400+
return &apiv1.ClusterCondition{}, nil
401+
}
402+
for index, cond := range cluster.Status.Conditions {
403+
if cond.Type == conditionType {
404+
backupConditionInCluster = &cluster.Status.Conditions[index]
405+
break
406+
}
407+
}
408+
return backupConditionInCluster, nil
409+
}

0 commit comments

Comments
 (0)