Skip to content

Commit db986e2

Browse files
authored
[Feature] Change DBServer Cleanup Logic (#1025)
1 parent a9d7849 commit db986e2

File tree

5 files changed

+60
-30
lines changed

5 files changed

+60
-30
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
- (Feature) Add `ArangoDeploymentReplication` CRD auto-installer
2222
- (Bugfix) Allow missing `token` key in License secret
2323
- (Feature) Unify agency access
24+
- (Feature) Change DBServer Cleanup Logic
2425

2526
## [1.2.13](https://github.com/arangodb/kube-arangodb/tree/1.2.13) (2022-06-07)
2627
- (Bugfix) Fix arangosync members state inspection

pkg/deployment/agency/current_collections.go

+31
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,41 @@ package agency
2222

2323
type StateCurrentCollections map[string]StateCurrentDBCollections
2424

25+
func (a StateCurrentCollections) IsDBServerPresent(name Server) bool {
26+
for _, v := range a {
27+
if v.IsDBServerPresent(name) {
28+
return true
29+
}
30+
}
31+
32+
return false
33+
}
34+
2535
type StateCurrentDBCollections map[string]StateCurrentDBCollection
2636

37+
func (a StateCurrentDBCollections) IsDBServerPresent(name Server) bool {
38+
for _, v := range a {
39+
if v.IsDBServerPresent(name) {
40+
return true
41+
}
42+
}
43+
44+
return false
45+
}
46+
2747
type StateCurrentDBCollection map[string]StateCurrentDBShard
2848

49+
func (a StateCurrentDBCollection) IsDBServerPresent(name Server) bool {
50+
51+
for _, v := range a {
52+
if v.Servers.Contains(name) {
53+
return true
54+
}
55+
}
56+
57+
return false
58+
}
59+
2960
// StateCurrentDBShard is the value type stored in a StateCurrentDBCollection.
// Its only field, Servers, is serialized under the JSON key "servers" and is
// omitted from the output when empty.
type StateCurrentDBShard struct {
3061
Servers Servers `json:"servers,omitempty"`
3162
}

pkg/deployment/agency/plan_collections.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ package agency
2222

2323
type StatePlanCollections map[string]StatePlanDBCollections
2424

25-
func (a StatePlanCollections) IsDBServerInDatabases(name Server) bool {
25+
func (a StatePlanCollections) IsDBServerPresent(name Server) bool {
2626
for _, collections := range a {
2727
if collections.IsDBServerInCollections(name) {
2828
return true

pkg/deployment/reconcile/plan_builder_normal.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ func (r *Reconciler) createMemberFailedRestorePlan(ctx context.Context, apiObjec
121121
continue
122122
}
123123

124-
if agencyState.Plan.Collections.IsDBServerInDatabases(agency.Server(m.ID)) {
124+
if agencyState.Plan.Collections.IsDBServerPresent(agency.Server(m.ID)) {
125125
// DBServer still exists in agency plan! Will not be removed, but needs to be recreated
126126
memberLog.Info("Recreating DBServer - it cannot be removed gracefully")
127127
plan = append(plan,

pkg/deployment/resilience/member_failure.go

+26-28
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,10 @@ import (
2424
"context"
2525
"time"
2626

27-
"github.com/arangodb/kube-arangodb/pkg/util/globals"
28-
2927
"github.com/arangodb/kube-arangodb/pkg/util/errors"
3028

3129
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
32-
"github.com/arangodb/kube-arangodb/pkg/util/arangod"
30+
"github.com/arangodb/kube-arangodb/pkg/deployment/agency"
3331
)
3432

3533
const (
@@ -74,10 +72,8 @@ func (r *Resilience) CheckMemberFailure(ctx context.Context) error {
7472
if m.IsNotReadySince(time.Now().Add(-notReadySinceGracePeriod)) {
7573
// Member has not been ready for longer than the grace period.
7674

77-
failureAcceptable, reason, err := r.isMemberFailureAcceptable(ctx, group, m)
78-
if err != nil {
79-
log.Err(err).Warn("Failed to check is member failure is acceptable")
80-
} else if failureAcceptable {
75+
failureAcceptable, reason := r.isMemberFailureAcceptable(group, m)
76+
if failureAcceptable {
8177
log.Info("Member is not ready for long time, marking is failed")
8278
m.Phase = api.MemberPhaseFailed
8379
status.Members.Update(m, group)
@@ -93,10 +89,8 @@ func (r *Resilience) CheckMemberFailure(ctx context.Context) error {
9389
count := m.RecentTerminationsSince(time.Now().Add(-recentTerminationsSinceGracePeriod))
9490
if count >= recentTerminationThreshold {
9591
// Member has terminated too often in recent history.
96-
failureAcceptable, reason, err := r.isMemberFailureAcceptable(ctx, group, m)
97-
if err != nil {
98-
log.Err(err).Warn("Failed to check is member failure is acceptable")
99-
} else if failureAcceptable {
92+
failureAcceptable, reason := r.isMemberFailureAcceptable(group, m)
93+
if failureAcceptable {
10094
log.Info("Member has terminated too often in recent history, marking is failed")
10195
m.Phase = api.MemberPhaseFailed
10296
status.Members.Update(m, group)
@@ -123,42 +117,46 @@ func (r *Resilience) CheckMemberFailure(ctx context.Context) error {
123117

124118
// isMemberFailureAcceptable checks if it is currently acceptable to switch the phase of the given member
125119
// to failed, which means that it will be replaced.
126-
// Return: failureAcceptable, notAcceptableReason, error
127-
func (r *Resilience) isMemberFailureAcceptable(ctx context.Context, group api.ServerGroup, m api.MemberStatus) (bool, string, error) {
120+
// Return: failureAcceptable, notAcceptableReason
121+
func (r *Resilience) isMemberFailureAcceptable(group api.ServerGroup, m api.MemberStatus) (bool, string) {
128122

129123
switch group {
130124
case api.ServerGroupAgents:
131125
agencyHealth, ok := r.context.GetAgencyHealth()
132126
if !ok {
133-
return false, "AgencyHealth is not present", nil
127+
return false, "AgencyHealth is not present"
134128
}
135129

136130
if err := agencyHealth.Healthy(); err != nil {
137-
return false, err.Error(), nil
131+
return false, err.Error()
138132
}
139133

140-
return true, "", nil
134+
return true, ""
141135
case api.ServerGroupDBServers:
142-
ctxChild, cancel := globals.GetGlobalTimeouts().ArangoD().WithTimeout(ctx)
143-
defer cancel()
144-
client, err := r.context.GetDatabaseClient(ctxChild)
145-
if err != nil {
146-
return false, "", errors.WithStack(err)
136+
agencyState, ok := r.context.GetAgencyCache()
137+
if !ok {
138+
return false, "AgencyHealth is not present"
139+
}
140+
141+
if agencyState.Plan.Collections.IsDBServerPresent(agency.Server(m.ID)) {
142+
return false, "DBServer still in Plan"
147143
}
148-
if err := arangod.IsDBServerEmpty(ctx, m.ID, client); err != nil {
149-
return false, err.Error(), nil
144+
145+
if agencyState.Current.Collections.IsDBServerPresent(agency.Server(m.ID)) {
146+
return false, "DBServer still in Current"
150147
}
151-
return true, "", nil
148+
149+
return true, ""
152150
case api.ServerGroupCoordinators:
153151
// Coordinators can be replaced at will
154-
return true, "", nil
152+
return true, ""
155153
case api.ServerGroupSyncMasters, api.ServerGroupSyncWorkers:
156154
// Sync masters & workers can be replaced at will
157-
return true, "", nil
155+
return true, ""
158156
case api.ServerGroupSingle:
159-
return false, "ServerGroupSingle can not marked as a failed", nil
157+
return false, "ServerGroupSingle can not marked as a failed"
160158
default:
161159
// TODO
162-
return false, "TODO", nil
160+
return false, "TODO"
163161
}
164162
}

0 commit comments

Comments
 (0)