Skip to content

Commit e807509

Browse files
authored
enhance(jindocache): recover jindocache master after unexpected crashes (#5553)
* feat(engine): add dataset mount sync interface and default implementation - Define ShouldSyncDatasetMounts and SyncDatasetMounts methods in engine interface - Implement default ShouldSyncDatasetMounts and SyncDatasetMounts in all engines returning no-op Signed-off-by: TzZtzt <trafalgarz@outlook.com> * enhance(jindocache): support jindocache repreparing UFS to recover cache engine from crashes Signed-off-by: TzZtzt <trafalgarz@outlook.com> * fix unit tests Signed-off-by: TzZtzt <trafalgarz@outlook.com> * fix comments Signed-off-by: TzZtzt <trafalgarz@outlook.com> * fix comments Signed-off-by: TzZtzt <trafalgarz@outlook.com> * chore(git): add .qoder directory to .gitignore - Prevents accidental commits of .qoder files Signed-off-by: TzZtzt <trafalgarz@outlook.com> --------- Signed-off-by: TzZtzt <trafalgarz@outlook.com>
1 parent 0e16600 commit e807509

File tree

14 files changed

+200
-1
lines changed

14 files changed

+200
-1
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ bin
2424
*.swo
2525
*~
2626
.vscode
27+
.qoder
2728

2829
**/*.tgz
2930
**/.DS_Store

pkg/ddc/alluxio/ufs.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,3 +201,11 @@ func (e *AlluxioEngine) checkIfRemountRequired(ufsToUpdate *utils.UFSToUpdate) {
201201
}
202202
}
203203
}
204+
205+
func (e *AlluxioEngine) ShouldSyncDatasetMounts() (should bool, err error) {
206+
return false, nil
207+
}
208+
209+
func (e *AlluxioEngine) SyncDatasetMounts() (err error) {
210+
return nil
211+
}

pkg/ddc/base/engine.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,13 @@ type Implement interface {
9696
// PrepareUFS prepare the mounts and metadata if it's not ready
9797
PrepareUFS() (err error)
9898

99+
// ShouldSyncDatasetMounts check if we need to sync the dataset mounts
100+
ShouldSyncDatasetMounts() (should bool, err error)
101+
102+
// SyncDatasetMounts sync the mounts in Dataset's spec into cache engine.
103+
// The func should not only handle mounts changes in the Dataset's spec, but also handle cases where a cache engine lose some mount info because of unexpected crashes.
104+
SyncDatasetMounts() (err error)
105+
99106
// ShouldUpdateUFS check if we need to update the ufs and return all ufs to update
100107
// If the ufs have changed and the engine supports add/remove mount points dynamically,
101108
// then we need to UpdateOnUFSChange

pkg/ddc/base/mock/mock_engine.go

Lines changed: 31 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/ddc/base/syncs.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,21 @@ func (t *TemplateEngine) Sync(ctx cruntime.ReconcileRequestContext) (err error)
8181
return
8282
}
8383

84-
// 6. Update dataset mount point
84+
// 6. Sync dataset mounts
85+
// TODO: SyncDatasetMounts() and UpdateUFS() should be merged in future refactoring as they describe a similar workflow
86+
var shouldSyncDatasetMounts bool
87+
shouldSyncDatasetMounts, err = t.Implement.ShouldSyncDatasetMounts()
88+
if err != nil {
89+
return
90+
}
91+
if shouldSyncDatasetMounts {
92+
err = t.Implement.SyncDatasetMounts()
93+
if err != nil {
94+
return
95+
}
96+
}
97+
98+
// 7. Update dataset mount point
8599
if permitSyncEngineStatus {
86100
ufsToUpdate := t.Implement.ShouldUpdateUFS()
87101
if ufsToUpdate != nil {

pkg/ddc/base/template_engine_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ var _ = Describe("TemplateEngine", func() {
131131
impl.EXPECT().CheckRuntimeHealthy().Return(nil).Times(1),
132132
impl.EXPECT().CheckAndUpdateRuntimeStatus().Return(true, nil).Times(1),
133133
impl.EXPECT().UpdateCacheOfDataset().Return(nil).Times(1),
134+
impl.EXPECT().ShouldSyncDatasetMounts().Return(false, nil).Times(1),
134135
impl.EXPECT().ShouldUpdateUFS().Return(&utils.UFSToUpdate{}).Times(1),
135136
impl.EXPECT().SyncScheduleInfoToCacheNodes().Return(nil).Times(1),
136137
)
@@ -166,6 +167,8 @@ var _ = Describe("TemplateEngine", func() {
166167
impl.EXPECT().CheckRuntimeHealthy().Return(nil).Times(1),
167168
impl.EXPECT().CheckAndUpdateRuntimeStatus().Return(true, nil).Times(1),
168169
impl.EXPECT().UpdateCacheOfDataset().Return(nil).Times(1),
170+
impl.EXPECT().ShouldSyncDatasetMounts().Return(true, nil).Times(1),
171+
impl.EXPECT().SyncDatasetMounts().Return(nil).Times(1),
169172
impl.EXPECT().ShouldUpdateUFS().Return(ufsToUpdate).Times(1),
170173
impl.EXPECT().UpdateOnUFSChange(ufsToUpdate).Times(1),
171174
impl.EXPECT().SyncScheduleInfoToCacheNodes().Return(nil).Times(1),

pkg/ddc/efc/ufs.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,3 +94,11 @@ func (e *EFCEngine) ShouldUpdateUFS() (ufsToUpdate *utils.UFSToUpdate) {
9494
func (e *EFCEngine) UpdateOnUFSChange(ufsToUpdate *utils.UFSToUpdate) (ready bool, err error) {
9595
return true, nil
9696
}
97+
98+
func (e *EFCEngine) ShouldSyncDatasetMounts() (should bool, err error) {
99+
return false, nil
100+
}
101+
102+
func (e *EFCEngine) SyncDatasetMounts() (err error) {
103+
return nil
104+
}

pkg/ddc/goosefs/ufs.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,3 +119,11 @@ func (e *GooseFSEngine) UpdateOnUFSChange(ufsToUpdate *utils.UFSToUpdate) (updat
119119
updateReady = true
120120
return
121121
}
122+
123+
func (e *GooseFSEngine) ShouldSyncDatasetMounts() (should bool, err error) {
124+
return false, nil
125+
}
126+
127+
func (e *GooseFSEngine) SyncDatasetMounts() (err error) {
128+
return nil
129+
}

pkg/ddc/jindo/ufs.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,3 +69,11 @@ func (e *JindoEngine) ShouldUpdateUFS() (ufsToUpdate *utils.UFSToUpdate) {
6969
func (e *JindoEngine) UpdateOnUFSChange(*utils.UFSToUpdate) (updateReady bool, err error) {
7070
return
7171
}
72+
73+
func (e *JindoEngine) ShouldSyncDatasetMounts() (should bool, err error) {
74+
return false, nil
75+
}
76+
77+
func (e *JindoEngine) SyncDatasetMounts() (err error) {
78+
return nil
79+
}

pkg/ddc/jindocache/ufs.go

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,16 @@ limitations under the License.
1717
package jindocache
1818

1919
import (
20+
"context"
2021
"fmt"
22+
"reflect"
2123

2224
"github.com/fluid-cloudnative/fluid/pkg/ddc/jindocache/operations"
2325
"github.com/fluid-cloudnative/fluid/pkg/utils"
26+
"github.com/fluid-cloudnative/fluid/pkg/utils/kubeclient"
27+
"github.com/pkg/errors"
28+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
29+
"k8s.io/client-go/util/retry"
2430
)
2531

2632
// ShouldCheckUFS checks if it requires checking UFS
@@ -117,3 +123,76 @@ func (e *JindoCacheEngine) ShouldUpdateUFS() (ufsToUpdate *utils.UFSToUpdate) {
117123
func (e *JindoCacheEngine) UpdateOnUFSChange(*utils.UFSToUpdate) (updateReady bool, err error) {
118124
return
119125
}
126+
127+
func (e *JindoCacheEngine) ShouldSyncDatasetMounts() (should bool, err error) {
128+
runtime, err := utils.GetJindoRuntime(e.Client, e.name, e.namespace)
129+
if err != nil {
130+
e.Log.Error(err, "failed to get runtime when if dataset mounts need to be synced")
131+
return false, errors.WithMessage(err, "failed to get runtime when checking if dataset mounts need to be synced")
132+
}
133+
134+
if runtime.Spec.Master.Disabled {
135+
return false, nil
136+
}
137+
138+
masterPodName, masterContainerName := e.getMasterPodInfo()
139+
masterPod, err := kubeclient.GetPodByName(e.Client, masterPodName, e.namespace)
140+
if err != nil || masterPod == nil {
141+
e.Log.Error(err, "failed to get master pod when checking if dataset mounts need to be synced")
142+
return false, errors.WithMessage(err, "failed to get master pod when checking if dataset mounts need to be synced")
143+
}
144+
145+
var startedAt *metav1.Time
146+
for _, containerStatus := range masterPod.Status.ContainerStatuses {
147+
if containerStatus.Name == masterContainerName {
148+
if containerStatus.State.Running == nil {
149+
e.Log.Info("Jindocache master container is not running, recheck its status in next reconciliation loop")
150+
return false, nil
151+
} else {
152+
startedAt = &containerStatus.State.Running.StartedAt
153+
break
154+
}
155+
}
156+
}
157+
158+
if startedAt == nil {
159+
e.Log.Info("Jindocache master container not found in pod container statuses when checking if dataset mounts need to be synced",
160+
"pod", masterPod.Name, "namespace", masterPod.Namespace, "container", masterContainerName)
161+
}
162+
163+
// either runtime.Status.MountTime is not set (for backward compatibility) or runtime.Status.MountTime is earlier than startedAt (i.e. jindocache master is restarted), we need to reprepare UFS
164+
needReprepareUFS := runtime.Status.MountTime == nil || (startedAt != nil && runtime.Status.MountTime.Before(startedAt))
165+
166+
return needReprepareUFS, nil
167+
}
168+
169+
func (e *JindoCacheEngine) SyncDatasetMounts() (err error) {
170+
// remount Dataset.spec.mounts and refresh cachesets
171+
if err = e.PrepareUFS(); err != nil {
172+
return err
173+
}
174+
175+
// update runtime.Status.MountTime to indicate that the Dataset.spec.mounts has been synced
176+
err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
177+
runtime, err := utils.GetJindoRuntime(e.Client, e.name, e.namespace)
178+
if err != nil {
179+
return err
180+
}
181+
182+
runtimeToUpdate := runtime.DeepCopy()
183+
nowTime := metav1.Now()
184+
runtimeToUpdate.Status.MountTime = &nowTime
185+
186+
if !reflect.DeepEqual(runtime.Status, runtimeToUpdate.Status) {
187+
return e.Client.Status().Update(context.TODO(), runtimeToUpdate)
188+
}
189+
190+
return nil
191+
})
192+
193+
if err != nil {
194+
return err
195+
}
196+
197+
return nil
198+
}

0 commit comments

Comments
 (0)