Skip to content

Commit 4f4cd96

Browse files
litaocdl, YanniHu1996, armru, gbartolini
authored
feat: change default value of startDelay to 3600 (cloudnative-pg#2847)
IMPORTANT: The default value of `startDelay` has now been changed to 3600 seconds, equivalent of 1 hour. Previously, it was set to 30 seconds. This patch also replaces the livenessProbe's initial delay with a more proper Kubernetes startup probe to deal with the start of a Postgres server. Both the startup probe and the higher default time should improve the self-healing capabilities of larger Postgres deployments where the default value of 30 seconds was not sufficient for example to complete a crash recovery operation, causing infinite restarts. Closes cloudnative-pg#2843 Signed-off-by: Tao Li <tao.li@enterprisedb.com> Signed-off-by: YanniHu1996 <yantian.hu@enterprisedb.com> Signed-off-by: Armando Ruocco <armando.ruocco@enterprisedb.com> Signed-off-by: Gabriele Bartolini <gabriele.bartolini@enterprisedb.com> Co-authored-by: YanniHu1996 <yantian.hu@enterprisedb.com> Co-authored-by: Armando Ruocco <armando.ruocco@enterprisedb.com> Co-authored-by: Gabriele Bartolini <gabriele.bartolini@enterprisedb.com>
1 parent 9565f9f commit 4f4cd96

File tree

5 files changed

+57
-14
lines changed

5 files changed

+57
-14
lines changed

api/v1/cluster_types.go

+10-3
Original file line numberDiff line numberDiff line change
@@ -265,8 +265,10 @@ type ClusterSpec struct {
265265
WalStorage *StorageConfiguration `json:"walStorage,omitempty"`
266266

267267
// The time in seconds that is allowed for a PostgreSQL instance to
268-
// successfully start up (default 30)
269-
// +kubebuilder:default:=30
268+
// successfully start up (default 3600).
269+
// The startup probe failure threshold is derived from this value using the formula:
270+
// ceiling(startDelay / 10).
271+
// +kubebuilder:default:=3600
270272
// +optional
271273
MaxStartDelay int32 `json:"startDelay,omitempty"`
272274

@@ -981,6 +983,11 @@ const (
981983
// is gracefully shutdown during a switchover.
982984
// It is one hour in seconds, long enough to act as an effectively infinite timeout
983985
DefaultMaxSwitchoverDelay = 3600
986+
987+
// DefaultStartupDelay is the default value for startDelay; startDelay is used to calculate the
988+
// FailureThreshold of the startup probe. The formula is `FailureThreshold = ceiling(startDelay / periodSeconds)`;
989+
// the minimum value is 1
990+
DefaultStartupDelay = 3600
984991
)
985992

986993
// PostgresConfiguration defines the PostgreSQL configuration
@@ -2381,7 +2388,7 @@ func (cluster *Cluster) GetMaxStartDelay() int32 {
23812388
if cluster.Spec.MaxStartDelay > 0 {
23822389
return cluster.Spec.MaxStartDelay
23832390
}
2384-
return 30
2391+
return DefaultStartupDelay
23852392
}
23862393

23872394
// GetMaxStopDelay gets the amount of time PostgreSQL has to stop

config/crd/bases/postgresql.cnpg.io_clusters.yaml

+5-3
Original file line numberDiff line numberDiff line change
@@ -3010,9 +3010,11 @@ spec:
30103010
- metadata
30113011
type: object
30123012
startDelay:
3013-
default: 30
3014-
description: The time in seconds that is allowed for a PostgreSQL
3015-
instance to successfully start up (default 30)
3013+
default: 3600
3014+
description: 'The time in seconds that is allowed for a PostgreSQL
3015+
instance to successfully start up (default 3600). The startup probe
3016+
failure threshold is derived from this value using the formula:
3017+
ceiling(startDelay / 10).'
30163018
format: int32
30173019
type: integer
30183020
stopDelay:

docs/src/cloudnative-pg.v1.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -1451,7 +1451,9 @@ user by setting it to <code>NULL</code>. Enabled by default.</p>
14511451
</td>
14521452
<td>
14531453
<p>The time in seconds that is allowed for a PostgreSQL instance to
1454-
successfully start up (default 30)</p>
1454+
successfully start up (default 3600).
1455+
The startup probe failure threshold is derived from this value using the formula:
1456+
ceiling(startDelay / 10).</p>
14551457
</td>
14561458
</tr>
14571459
<tr><td><code>stopDelay</code><br/>

pkg/specs/pods.go

+29-7
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ package specs
2121
import (
2222
"encoding/json"
2323
"fmt"
24+
"math"
2425
"reflect"
2526
"strconv"
2627

@@ -81,6 +82,12 @@ const (
8182

8283
// ReadinessProbePeriod is the period set for the postgres instance readiness probe
8384
ReadinessProbePeriod = 10
85+
86+
// StartupProbePeriod is the period set for the postgres instance startup probe
87+
StartupProbePeriod = 10
88+
89+
// LivenessProbePeriod is the period set for the postgres instance liveness probe
90+
LivenessProbePeriod = 10
8491
)
8592

8693
// EnvConfig carries the environment configuration of a container
@@ -184,6 +191,17 @@ func createPostgresContainers(cluster apiv1.Cluster, envConfig EnvConfig) []core
184191
Env: envConfig.EnvVars,
185192
EnvFrom: envConfig.EnvFrom,
186193
VolumeMounts: createPostgresVolumeMounts(cluster),
194+
StartupProbe: &corev1.Probe{
195+
FailureThreshold: getStartupProbeFailureThreshold(cluster.GetMaxStartDelay()),
196+
PeriodSeconds: StartupProbePeriod,
197+
TimeoutSeconds: 5,
198+
ProbeHandler: corev1.ProbeHandler{
199+
HTTPGet: &corev1.HTTPGetAction{
200+
Path: url.PathHealth,
201+
Port: intstr.FromInt32(int32(url.StatusPort)),
202+
},
203+
},
204+
},
187205
ReadinessProbe: &corev1.Probe{
188206
TimeoutSeconds: 5,
189207
PeriodSeconds: ReadinessProbePeriod,
@@ -194,14 +212,9 @@ func createPostgresContainers(cluster apiv1.Cluster, envConfig EnvConfig) []core
194212
},
195213
},
196214
},
197-
// From K8s 1.17 and newer, startup probes will be available for
198-
// all users and not just protected from feature gates. For now
199-
// let's use the LivenessProbe. When we will drop support for K8s
200-
// 1.16, we'll configure a StartupProbe and this will lead to a
201-
// better LivenessProbe (without InitialDelaySeconds).
202215
LivenessProbe: &corev1.Probe{
203-
InitialDelaySeconds: cluster.GetMaxStartDelay(),
204-
TimeoutSeconds: 5,
216+
PeriodSeconds: LivenessProbePeriod,
217+
TimeoutSeconds: 5,
205218
ProbeHandler: corev1.ProbeHandler{
206219
HTTPGet: &corev1.HTTPGetAction{
207220
Path: url.PathHealth,
@@ -241,6 +254,15 @@ func createPostgresContainers(cluster apiv1.Cluster, envConfig EnvConfig) []core
241254
return containers
242255
}
243256

257+
// getStartupProbeFailureThreshold returns the startup probe failure threshold:
258+
// FAILURE_THRESHOLD = ceil(startDelay / periodSeconds), with a minimum value of 1.
259+
func getStartupProbeFailureThreshold(startupDelay int32) int32 {
260+
if startupDelay <= StartupProbePeriod {
261+
return 1
262+
}
263+
return int32(math.Ceil(float64(startupDelay) / float64(StartupProbePeriod)))
264+
}
265+
244266
// CreateAffinitySection creates the affinity sections for Pods, given the configuration
245267
// from the user
246268
func CreateAffinitySection(clusterName string, config apiv1.AffinityConfiguration) *corev1.Affinity {

pkg/specs/pods_test.go

+10
Original file line numberDiff line numberDiff line change
@@ -829,3 +829,13 @@ var _ = Describe("PodSpec drift detection", func() {
829829
Expect(specsMatch).To(BeFalse())
830830
})
831831
})
832+
833+
var _ = Describe("Compute startup probe failure threshold", func() {
834+
It("should take the minimum value 1", func() {
835+
Expect(getStartupProbeFailureThreshold(5)).To(BeNumerically("==", 1))
836+
})
837+
838+
It("should take the value from 'startDelay / periodSeconds'", func() {
839+
Expect(getStartupProbeFailureThreshold(109)).To(BeNumerically("==", 11))
840+
})
841+
})

0 commit comments

Comments
 (0)