Skip to content

Commit 560725d

Browse files
wadlejitendra and armru authored
test(AKS): improves E2E test resiliency to transient network failures
This patch wraps several AKS API calls inside `Eventually` or `Retry` statements to avoid faulty E2E test results due to transient network issues. Co-authored-by: Armando Ruocco <armando.ruocco@enterprisedb.com>
1 parent 6baab19 commit 560725d

23 files changed

+284
-149
lines changed

tests/e2e/asserts_test.go

Lines changed: 10 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -551,11 +551,13 @@ func AssertNewPrimary(namespace string, clusterName string, oldPrimary string) {
551551
}
552552

553553
func AssertStorageCredentialsAreCreated(namespace string, name string, id string, key string) {
554-
_, _, err := testsUtils.Run(fmt.Sprintf("kubectl create secret generic %v -n %v "+
555-
"--from-literal='ID=%v' "+
556-
"--from-literal='KEY=%v'",
557-
name, namespace, id, key))
558-
Expect(err).ToNot(HaveOccurred())
554+
Eventually(func() error {
555+
_, _, err := testsUtils.Run(fmt.Sprintf("kubectl create secret generic %v -n %v "+
556+
"--from-literal='ID=%v' "+
557+
"--from-literal='KEY=%v'",
558+
name, namespace, id, key))
559+
return err
560+
}, 60, 5).Should(BeNil())
559561
}
560562

561563
// AssertArchiveWalOnMinio to archive walls and verify that exists
@@ -735,7 +737,8 @@ func AssertWritesToReplicaFails(
735737
Expect(value, err).To(Equal("t"))
736738

737739
// Expect to be in a read-only transaction
738-
_, _, err = env.ExecCommand(env.Ctx, *connectingPod, specs.PostgresContainerName, &timeout,
740+
_, _, err = utils.ExecCommand(env.Ctx, env.Interface, env.RestClientConfig, *connectingPod,
741+
specs.PostgresContainerName, &timeout,
739742
"psql", dsn, "-tAc", "CREATE TABLE table1(var1 text);")
740743
Expect(err).To(HaveOccurred())
741744
Expect(err.Error()).Should(
@@ -2015,7 +2018,7 @@ func CreateResourceFromFile(namespace, sampleFilePath string) {
20152018
return err
20162019
}
20172020
return nil
2018-
}, 60, 5).Should(BeNil())
2021+
}, RetryTimeout, PollingTime).Should(BeNil())
20192022
}
20202023

20212024
func AssertBackupConditionInClusterStatus(namespace, clusterName string) {

tests/e2e/configuration_update_test.go

Lines changed: 47 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@ import (
1515

1616
apiv1 "github.com/EnterpriseDB/cloud-native-postgresql/api/v1"
1717
"github.com/EnterpriseDB/cloud-native-postgresql/pkg/specs"
18+
devUtils "github.com/EnterpriseDB/cloud-native-postgresql/pkg/utils"
1819
"github.com/EnterpriseDB/cloud-native-postgresql/tests"
1920
"github.com/EnterpriseDB/cloud-native-postgresql/tests/utils"
2021

@@ -24,12 +25,42 @@ import (
2425

2526
var _ = Describe("Configuration update", func() {
2627
const (
27-
clusterName = "postgresql-storage-class"
28-
namespace = "cluster-update-config-e2e"
29-
sampleFile = fixturesDir + "/base/cluster-storage-class.yaml"
30-
level = tests.High
28+
clusterName = "postgresql-storage-class"
29+
namespace = "cluster-update-config-e2e"
30+
sampleFile = fixturesDir + "/base/cluster-storage-class.yaml"
31+
level = tests.High
32+
autoVacuumMaxWorkers = 4
3133
)
3234

35+
checkErrorOutFixedAndBlockedConfigurationParameter := func(sample string) {
36+
// Update the configuration
37+
Eventually(func() error {
38+
_, _, err := utils.RunUnchecked("kubectl apply -n " + namespace + " -f " + sample)
39+
return err
40+
// Expecting an error when a blockedConfigurationParameter is modified
41+
}, RetryTimeout, PollingTime).ShouldNot(BeNil())
42+
43+
podList, err := env.GetClusterPodList(namespace, clusterName)
44+
Expect(err).ToNot(HaveOccurred())
45+
46+
const timeout = 60
47+
commandTimeout := time.Second * 2
48+
// Expect other config parameters applied together with a blockedParameter to not have changed
49+
for idx := range podList.Items {
50+
pod := podList.Items[idx]
51+
Eventually(func(g Gomega) int {
52+
stdout, _, err := env.ExecCommand(env.Ctx, pod, specs.PostgresContainerName, &commandTimeout,
53+
"psql", "-U", "postgres", "-tAc", "show autovacuum_max_workers")
54+
g.Expect(err).ToNot(HaveOccurred())
55+
56+
value, atoiErr := strconv.Atoi(strings.Trim(stdout, "\n"))
57+
g.Expect(atoiErr).ToNot(HaveOccurred())
58+
59+
return value
60+
}, timeout).ShouldNot(BeEquivalentTo(autoVacuumMaxWorkers))
61+
}
62+
}
63+
3364
BeforeEach(func() {
3465
if testLevelEnv.Depth < int(level) {
3566
Skip("Test depth is lower than the amount requested for this test")
@@ -80,8 +111,15 @@ var _ = Describe("Configuration update", func() {
80111
// Connection should fail now because we are not supplying a password
81112
podList, err := env.GetClusterPodList(namespace, clusterName)
82113
Expect(err).ToNot(HaveOccurred())
83-
stdout, _, err := env.ExecCommand(env.Ctx, podList.Items[0], specs.PostgresContainerName, &commandtimeout,
84-
"psql", "-U", "postgres", "-h", endpointName, "-tAc", "select 1")
114+
stdout, _, err := devUtils.ExecCommand(
115+
env.Ctx,
116+
env.Interface,
117+
env.RestClientConfig,
118+
podList.Items[0],
119+
specs.PostgresContainerName,
120+
&commandtimeout,
121+
"psql", "-U", "postgres", "-h", endpointName, "-tAc", "select 1",
122+
)
85123
Expect(err).To(HaveOccurred())
86124
// Update the configuration
87125
CreateResourceFromFile(namespace, sample)
@@ -180,45 +218,11 @@ var _ = Describe("Configuration update", func() {
180218
})
181219
By("Erroring out when a fixedConfigurationParameter is modified", func() {
182220
sample := fixturesDir + "/config_update/05-fixed-params.yaml"
183-
// Update the configuration
184-
_, _, err := utils.RunUnchecked("kubectl apply -n " + namespace + " -f " + sample)
185-
// Expecting an error when a fixedConfigurationParameter is modified
186-
Expect(err).To(HaveOccurred())
187-
podList, err := env.GetClusterPodList(namespace, clusterName)
188-
Expect(err).ToNot(HaveOccurred())
189-
timeout := 60
190-
commandtimeout := time.Second * 2
191-
// Expect other config parameters applied together with a fixedParameter to not have changed
192-
for _, pod := range podList.Items {
193-
pod := pod // pin the variable
194-
Eventually(func() (int, error, error) {
195-
stdout, _, err := env.ExecCommand(env.Ctx, pod, specs.PostgresContainerName, &commandtimeout,
196-
"psql", "-U", "postgres", "-tAc", "show autovacuum_max_workers")
197-
value, atoiErr := strconv.Atoi(strings.Trim(stdout, "\n"))
198-
return value, err, atoiErr
199-
}, timeout).ShouldNot(BeEquivalentTo(4))
200-
}
221+
checkErrorOutFixedAndBlockedConfigurationParameter(sample)
201222
})
202223
By("Erroring out when a blockedConfigurationParameter is modified", func() {
203224
sample := fixturesDir + "/config_update/06-blocked-params.yaml"
204-
// Update the configuration
205-
_, _, err := utils.RunUnchecked("kubectl apply -n " + namespace + " -f " + sample)
206-
// Expecting an error when a blockedConfigurationParameter is modified
207-
Expect(err).To(HaveOccurred())
208-
podList, err := env.GetClusterPodList(namespace, clusterName)
209-
Expect(err).ToNot(HaveOccurred())
210-
timeout := 60
211-
commandtimeout := time.Second * 2
212-
// Expect other config parameters applied together with a blockedParameter to not have changed
213-
for _, pod := range podList.Items {
214-
pod := pod
215-
Eventually(func() (int, error, error) {
216-
stdout, _, err := env.ExecCommand(env.Ctx, pod, specs.PostgresContainerName, &commandtimeout,
217-
"psql", "-U", "postgres", "-tAc", "show autovacuum_max_workers")
218-
value, atoiErr := strconv.Atoi(strings.Trim(stdout, "\n"))
219-
return value, err, atoiErr
220-
}, timeout).ShouldNot(BeEquivalentTo(4))
221-
}
225+
checkErrorOutFixedAndBlockedConfigurationParameter(sample)
222226
})
223227

224228
// nolint:dupl
@@ -238,8 +242,7 @@ var _ = Describe("Configuration update", func() {
238242
Expect(cluster.Status.CurrentPrimary, err).To(BeEquivalentTo(cluster.Status.TargetPrimary))
239243
oldPrimary := cluster.Status.CurrentPrimary
240244
// Update the configuration
241-
_, _, err = utils.Run("kubectl apply -n " + namespace + " -f " + sample)
242-
Expect(err).ToNot(HaveOccurred())
245+
CreateResourceFromFile(namespace, sample)
243246
timeout := 300
244247
commandtimeout := time.Second * 2
245248
// Check that the new parameter has been modified in every pod

tests/e2e/eviction_test.go

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -99,14 +99,6 @@ var _ = Describe("Pod eviction", Serial, Label(tests.LabelDisruptive), func() {
9999
if testLevelEnv.Depth < int(level) {
100100
Skip("Test depth is lower than the amount requested for this test")
101101
}
102-
// limit the case running on local kind env as we are using taint to simulate the eviction
103-
// we do not know if other cloud vendor crd controller is running on the node been evicted
104-
isIBM := env.IsIBM()
105-
isAKS, _ := env.IsAKS()
106-
isGKE, _ := env.IsGKE()
107-
if isIBM || isAKS || isGKE {
108-
Skip("Test runs only on local")
109-
}
110102
})
111103
JustAfterEach(func() {
112104
clusterName, err := env.GetResourceNameFromYAML(singleInstanceSampleFile)
@@ -117,6 +109,14 @@ var _ = Describe("Pod eviction", Serial, Label(tests.LabelDisruptive), func() {
117109
}
118110
})
119111
BeforeAll(func() {
112+
// limit the case running on local kind env as we are using taint to simulate the eviction
113+
// we do not know if other cloud vendor crd controller is running on the node been evicted
114+
isIBM := env.IsIBM()
115+
isAKS, _ := env.IsAKS()
116+
isGKE, _ := env.IsGKE()
117+
if isIBM || isAKS || isGKE {
118+
Skip("Test runs only on local")
119+
}
120120
namespace = "single-instance-pod-eviction"
121121
err := env.CreateNamespace(namespace)
122122
Expect(err).ToNot(HaveOccurred())

tests/e2e/logs_test.go

Lines changed: 39 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@ import (
1818
apiv1 "github.com/EnterpriseDB/cloud-native-postgresql/api/v1"
1919
"github.com/EnterpriseDB/cloud-native-postgresql/pkg/management/postgres/logpipe"
2020
"github.com/EnterpriseDB/cloud-native-postgresql/pkg/specs"
21+
"github.com/EnterpriseDB/cloud-native-postgresql/pkg/utils"
2122
"github.com/EnterpriseDB/cloud-native-postgresql/tests"
2223
testsUtils "github.com/EnterpriseDB/cloud-native-postgresql/tests/utils"
2324

@@ -51,8 +52,8 @@ var _ = Describe("JSON log output", func() {
5152
clusterName = "postgresql-json-logs"
5253
const sampleFile = fixturesDir + "/json_logs/cluster-json-logs.yaml"
5354
// Create a cluster in a namespace we'll delete after the test
54-
err := env.CreateNamespace(namespace)
55-
Expect(err).ToNot(HaveOccurred())
55+
namespaceErr := env.CreateNamespace(namespace)
56+
Expect(namespaceErr).ToNot(HaveOccurred())
5657
AssertCreateCluster(namespace, clusterName, sampleFile, env)
5758

5859
By("verifying the presence of possible logger values", func() {
@@ -79,24 +80,29 @@ var _ = Describe("JSON log output", func() {
7980
timeout := 300
8081

8182
for _, pod := range podList.Items {
83+
var queryError error
8284
// Run a wrong query and save its result
8385
commandTimeout := time.Second * 5
84-
_, _, err = env.ExecCommand(env.Ctx, pod, specs.PostgresContainerName,
85-
&commandTimeout, "psql", "-U", "postgres", "app", "-tAc", errorTestQuery)
86-
Expect(err).To(HaveOccurred())
87-
expectedResult := err.Error()
86+
Eventually(func(g Gomega) error {
87+
_, _, queryError = utils.ExecCommand(env.Ctx, env.Interface, env.RestClientConfig, pod,
88+
specs.PostgresContainerName, &commandTimeout, "psql", "-U", "postgres", "app", "-tAc",
89+
errorTestQuery)
90+
return queryError
91+
}, RetryTimeout, PollingTime).ShouldNot(BeNil())
8892

8993
// Eventually the error log line will be logged
90-
Eventually(func() (bool, error) {
94+
Eventually(func(g Gomega) bool {
9195
// Gather pod logs in the form of a Json Array
9296
logEntries, err := testsUtils.ParseJSONLogs(namespace, pod.GetName(), env)
93-
if err != nil {
94-
return false, err
95-
}
97+
g.Expect(err).ToNot(HaveOccurred())
9698

9799
// Gather the record containing the wrong query result
98-
return testsUtils.AssertQueryRecord(logEntries, errorTestQuery, expectedResult,
99-
logpipe.LoggingCollectorRecordName), nil
100+
return testsUtils.AssertQueryRecord(
101+
logEntries,
102+
errorTestQuery,
103+
queryError.Error(),
104+
logpipe.LoggingCollectorRecordName,
105+
)
100106
}, timeout).Should(BeTrue())
101107
}
102108
})
@@ -106,12 +112,15 @@ var _ = Describe("JSON log output", func() {
106112
primaryPod, _ := env.GetClusterPrimary(namespace, clusterName)
107113
timeout := 300
108114

115+
var queryError error
109116
// Run a wrong query on just the primary and save its result
110117
commandTimeout := time.Second * 5
111-
_, _, err = env.ExecCommand(env.Ctx, *primaryPod, specs.PostgresContainerName,
112-
&commandTimeout, "psql", "-U", "postgres", "app", "-tAc", errorTestQuery)
113-
Expect(err).To(HaveOccurred())
114-
expectedResult := err.Error()
118+
Eventually(func() error {
119+
_, _, queryError = utils.ExecCommand(env.Ctx, env.Interface, env.RestClientConfig,
120+
*primaryPod, specs.PostgresContainerName,
121+
&commandTimeout, "psql", "-U", "postgres", "app", "-tAc", errorTestQuery)
122+
return queryError
123+
}, RetryTimeout, PollingTime).ShouldNot(BeNil())
115124

116125
// Expect the query to be eventually logged on the primary
117126
Eventually(func() (bool, error) {
@@ -123,17 +132,17 @@ var _ = Describe("JSON log output", func() {
123132
}
124133

125134
// Gather the record containing the wrong query result
126-
return testsUtils.AssertQueryRecord(logEntries, errorTestQuery, expectedResult,
135+
return testsUtils.AssertQueryRecord(logEntries, errorTestQuery, queryError.Error(),
127136
logpipe.LoggingCollectorRecordName), nil
128137
}, timeout).Should(BeTrue())
129138

130139
// Retrieve cluster replicas
131140
podList := &corev1.PodList{}
132-
err = env.Client.List(
141+
listError := env.Client.List(
133142
env.Ctx, podList, client.InNamespace(namespace),
134143
client.MatchingLabels{"postgresql": clusterName, "role": "replica"},
135144
)
136-
Expect(err).ToNot(HaveOccurred())
145+
Expect(listError).ToNot(HaveOccurred())
137146

138147
// Expect the query not to be logged on replicas
139148
for _, pod := range podList.Items {
@@ -143,8 +152,14 @@ var _ = Describe("JSON log output", func() {
143152
Expect(len(logEntries) > 0).To(BeTrue())
144153

145154
// No record should be returned in this case
146-
Expect(testsUtils.AssertQueryRecord(logEntries, expectedResult, errorTestQuery,
147-
logpipe.LoggingCollectorRecordName)).Should(BeFalse())
155+
isQueryRecordContained := testsUtils.AssertQueryRecord(
156+
logEntries,
157+
queryError.Error(),
158+
errorTestQuery,
159+
logpipe.LoggingCollectorRecordName,
160+
)
161+
162+
Expect(isQueryRecordContained).Should(BeFalse())
148163
}
149164
})
150165

@@ -155,8 +170,9 @@ var _ = Describe("JSON log output", func() {
155170
forceDelete := &client.DeleteOptions{
156171
GracePeriodSeconds: &zero,
157172
}
158-
err = env.DeletePod(namespace, currentPrimary.GetName(), forceDelete)
159-
Expect(err).ToNot(HaveOccurred())
173+
174+
deletePodError := env.DeletePod(namespace, currentPrimary.GetName(), forceDelete)
175+
Expect(deletePodError).ToNot(HaveOccurred())
160176

161177
// Expect a new primary to be elected
162178
timeout := 180

tests/e2e/rolling_update_test.go

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,6 @@ import (
1212

1313
corev1 "k8s.io/api/core/v1"
1414
"k8s.io/apimachinery/pkg/types"
15-
"k8s.io/client-go/util/retry"
1615

1716
apiv1 "github.com/EnterpriseDB/cloud-native-postgresql/api/v1"
1817
"github.com/EnterpriseDB/cloud-native-postgresql/internal/configuration"
@@ -76,13 +75,13 @@ var _ = Describe("Rolling updates", func() {
7675
Namespace: namespace,
7776
Name: clusterName,
7877
}
79-
err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
78+
Eventually(func(g Gomega) error {
8079
err := env.Client.Get(env.Ctx, namespacedName, cluster)
81-
Expect(err).ToNot(HaveOccurred())
80+
g.Expect(err).ToNot(HaveOccurred())
81+
8282
cluster.Spec.ImageName = updatedImageName
8383
return env.Client.Update(env.Ctx, cluster)
84-
})
85-
Expect(err).ToNot(HaveOccurred())
84+
}, RetryTimeout, PollingTime).Should(BeNil())
8685

8786
// All the postgres containers should have the updated image
8887
Eventually(func() (int32, error) {

tests/e2e/suite_test.go

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,10 @@ import (
2525
const (
2626
samplesDir = "../../docs/src/samples"
2727
fixturesDir = "./fixtures"
28+
// RetryTimeout retry time when client api call or kubectl cli request get failed
29+
RetryTimeout = 60
30+
// PollingTime polling between retry
31+
PollingTime = 5
2832
)
2933

3034
var (

tests/e2e/syncreplicas_test.go

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,6 @@ import (
1414

1515
corev1 "k8s.io/api/core/v1"
1616
"k8s.io/apimachinery/pkg/types"
17-
"k8s.io/client-go/util/retry"
1817

1918
clusterv1 "github.com/EnterpriseDB/cloud-native-postgresql/api/v1"
2019
"github.com/EnterpriseDB/cloud-native-postgresql/pkg/specs"
@@ -89,16 +88,15 @@ var _ = Describe("Synchronous Replicas", func() {
8988
Namespace: namespace,
9089
Name: clusterName,
9190
}
92-
9391
// Set MaxSyncReplicas to 1
94-
err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
92+
Eventually(func(g Gomega) error {
9593
cluster := &clusterv1.Cluster{}
9694
err := env.Client.Get(env.Ctx, namespacedName, cluster)
97-
Expect(err).ToNot(HaveOccurred())
95+
g.Expect(err).ToNot(HaveOccurred())
96+
9897
cluster.Spec.MaxSyncReplicas = 1
9998
return env.Client.Update(env.Ctx, cluster)
100-
})
101-
Expect(err).ToNot(HaveOccurred())
99+
}, 60, 5).Should(BeNil())
102100

103101
// Scale the cluster down to 2 pods
104102
_, _, err := utils.Run(fmt.Sprintf("kubectl scale --replicas=2 -n %v cluster/%v", namespace, clusterName))

0 commit comments

Comments
 (0)