@@ -21,6 +21,7 @@ type lifecycleManager struct {
2121 provider * fractionProvider // provider for fraction operations
2222 flags * StateManager // storage state flags
2323 registry * fractionRegistry // fraction state registry
24+ tasks * TaskManager // Background offloading tasks
2425
2526 sealingWg sync.WaitGroup
2627}
@@ -36,18 +37,26 @@ func newLifecycleManager(
3637 provider : provider ,
3738 flags : flags ,
3839 registry : registry ,
40+ tasks : NewTaskManager (),
3941 }
4042}
4143
4244// Maintain performs periodic lifecycle management tasks.
4345// It coordinates rotation, offloading, cleanup based on configuration.
44- func (lc * lifecycleManager ) Maintain (ctx context.Context , config * Config , wg * sync.WaitGroup ) {
45- lc .rotate (config .FracSize , wg )
46- if config .OffloadingEnabled {
47- lc .offloadLocal (ctx , config .TotalSize , wg )
48- lc .cleanRemote (config .OffloadingRetention , wg )
46+ func (lc * lifecycleManager ) Maintain (ctx context.Context , cfg * Config , wg * sync.WaitGroup ) {
47+
48+ suspendThreshold := cfg .TotalSize + cfg .TotalSize / 100 + cfg .OffloadingQueueSize
49+ lc .registry .SuspendIfOverCapacity (cfg .SealingQueueLen , suspendThreshold )
50+
51+ lc .rotate (cfg .FracSize , wg )
52+ if cfg .OffloadingEnabled {
53+ lc .offloadLocal (ctx , cfg .TotalSize , cfg .OffloadingRetryDelay , wg )
54+ if cfg .OffloadingQueueSize > 0 {
55+ lc .removeOverflowed (cfg .OffloadingQueueSize , wg )
56+ }
57+ lc .cleanRemote (cfg .OffloadingRetention , wg )
4958 } else {
50- lc .cleanLocal (config .TotalSize , wg )
59+ lc .cleanLocal (cfg .TotalSize , wg )
5160 }
5261 lc .updateOldestMetric ()
5362 lc .SyncInfoCache ()
@@ -113,17 +122,18 @@ func (lc *lifecycleManager) rotate(maxSize uint64, wg *sync.WaitGroup) {
113122
114123// offloadLocal starts offloading of local fractions to remote storage.
115124// Selects fractions based on disk space usage and retention policy.
116- func (lc * lifecycleManager ) offloadLocal (ctx context.Context , sizeLimit uint64 , wg * sync.WaitGroup ) {
125+ func (lc * lifecycleManager ) offloadLocal (ctx context.Context , sizeLimit uint64 , retryDelay time. Duration , wg * sync.WaitGroup ) {
117126 toOffload , err := lc .registry .EvictLocal (true , sizeLimit )
118127 if err != nil {
119128 logger .Fatal ("error releasing old fractions:" , zap .Error (err ))
120129 }
121130 for _ , sealed := range toOffload {
122131 wg .Add (1 )
123- go func () {
132+ lc . tasks . Run ( sealed . instance . BaseFileName , ctx , func (ctx context. Context ) {
124133 defer wg .Done ()
125134
126- remote , _ := lc .tryOffload (ctx , sealed .instance )
135+ remote := lc .offloadWithRetry (ctx , sealed .instance , retryDelay )
136+
127137 lc .registry .PromoteToRemote (sealed , remote )
128138
129139 if remote == nil {
@@ -136,7 +146,41 @@ func (lc *lifecycleManager) offloadLocal(ctx context.Context, sizeLimit uint64,
136146 // free up local resources
137147 sealed .instance .Suicide ()
138148 maintenanceTruncateTotal .Add (1 )
139- }()
149+ })
150+ }
151+ }
152+
153+ // OffloadWithRetry attempts to offload a fraction with retries until success or cancellation.
154+ // Returns the remote fraction instance and a boolean indicating whether offloading was not canceled.
155+ func (lc * lifecycleManager ) offloadWithRetry (ctx context.Context , sealed * frac.Sealed , retryDelay time.Duration ) * frac.Remote {
156+ start := time .Now ()
157+ for i := 0 ; ; i ++ {
158+ remote , err := lc .tryOffload (ctx , sealed )
159+ if err == nil {
160+ return remote
161+ }
162+
163+ logger .Warn (
164+ "fail to offload fraction" ,
165+ zap .String ("name" , sealed .BaseFileName ),
166+ zap .Duration ("offloading_time" , time .Since (start )),
167+ zap .Int ("attempts" , i ),
168+ zap .Error (err ),
169+ )
170+
171+ select {
172+ case <- ctx .Done ():
173+ logger .Info (
174+ "fraction offloading was stopped" ,
175+ zap .String ("name" , sealed .BaseFileName ),
176+ zap .Duration ("offloading_time" , time .Since (start )),
177+ zap .Int ("attempts" , i ),
178+ zap .Error (ctx .Err ()),
179+ )
180+ return nil
181+ case <- time .After (retryDelay ):
182+ // Wait before next retry attempt
183+ }
140184 }
141185}
142186
@@ -163,9 +207,6 @@ func (lc *lifecycleManager) tryOffload(ctx context.Context, sealed *frac.Sealed)
163207
164208// cleanRemote deletes outdated remote fractions based on retention policy.
165209func (lc * lifecycleManager ) cleanRemote (retention time.Duration , wg * sync.WaitGroup ) {
166- if retention == 0 {
167- return
168- }
169210 toDelete := lc .registry .EvictRemote (retention )
170211 wg .Add (1 )
171212 go func () {
@@ -207,3 +248,18 @@ func (lc *lifecycleManager) updateOldestMetric() {
207248 oldestFracTime .WithLabelValues ("remote" ).Set ((time .Duration (lc .registry .OldestTotal ()) * time .Millisecond ).Seconds ())
208249 oldestFracTime .WithLabelValues ("local" ).Set ((time .Duration (lc .registry .OldestLocal ()) * time .Millisecond ).Seconds ())
209250}
251+
252+ // removeOverflowed removes fractions from offloading queue that exceed size limit
253+ // Stops ongoing offloading tasks and cleans up both local and remote resources.
254+ func (lc * lifecycleManager ) removeOverflowed (sizeLimit uint64 , wg * sync.WaitGroup ) {
255+ evicted := lc .registry .EvictOverflowed (sizeLimit )
256+ for _ , item := range evicted {
257+ wg .Add (1 )
258+ go func () {
259+ defer wg .Done ()
260+ // Cancel the offloading task - this operation may take significant time
261+ // hence executed in a separate goroutine to avoid blocking
262+ lc .tasks .Cancel (item .instance .BaseFileName )
263+ }()
264+ }
265+ }
0 commit comments