@@ -21,6 +21,7 @@ type lifecycleManager struct {
2121 provider * fractionProvider // Provider for fraction operations
2222 flags * StateManager // Storage state flags
2323 registry * fractionRegistry // Fraction state registry
24+ tasks * TaskManager // Background offloading tasks
2425
2526 sealingWg sync.WaitGroup
2627}
@@ -36,19 +37,27 @@ func newLifecycleManager(
3637 provider : provider ,
3738 flags : flags ,
3839 registry : registry ,
40+ tasks : NewTaskManager (),
3941 }
4042}
4143
4244// Maintain performs periodic lifecycle management tasks.
4345// It is a CORE method of lifecycleManager
4446// Coordinates rotation, offloading, cleanup based on configuration.
45- func (lc * lifecycleManager ) Maintain (ctx context.Context , config * Config , wg * sync.WaitGroup ) {
46- lc .Rotate (config .FracSize , wg )
47- if config .OffloadingEnabled {
48- lc .OffloadLocal (ctx , config .TotalSize , wg )
49- lc .CleanRemote (config .OffloadingRetention , wg )
47+ func (lc * lifecycleManager ) Maintain (ctx context.Context , cfg * Config , wg * sync.WaitGroup ) {
48+
49+ suspendThreshold := cfg .TotalSize + cfg .TotalSize / 100 + cfg .OffloadingQueueSize
50+ lc .registry .SuspendIfOverCapacity (cfg .SealingQueueLen , suspendThreshold )
51+
52+ lc .Rotate (cfg .FracSize , wg )
53+ if cfg .OffloadingEnabled {
54+ lc .OffloadLocal (ctx , cfg .TotalSize , cfg .OffloadingRetryDelay , wg )
55+ if cfg .OffloadingQueueSize > 0 {
56+ lc .RemoveOverflowed (cfg .OffloadingQueueSize , wg )
57+ }
58+ lc .CleanRemote (cfg .OffloadingRetention , wg )
5059 } else {
51- lc .CleanLocal (config .TotalSize , wg )
60+ lc .CleanLocal (cfg .TotalSize , wg )
5261 }
5362 lc .UpdateOldestMetric ()
5463 lc .SyncInfoCache ()
@@ -113,17 +122,18 @@ func (lc *lifecycleManager) Rotate(maxSize uint64, wg *sync.WaitGroup) {
113122
114123// OffloadLocal starts offloading of local fractions to remote storage
115124// Selects fractions based on disk space usage and retention policy.
116- func (lc * lifecycleManager ) OffloadLocal (ctx context.Context , sizeLimit uint64 , wg * sync.WaitGroup ) {
125+ func (lc * lifecycleManager ) OffloadLocal (ctx context.Context , sizeLimit uint64 , retryDelay time. Duration , wg * sync.WaitGroup ) {
117126 toOffload , err := lc .registry .EvictLocal (true , sizeLimit )
118127 if err != nil {
119128 logger .Fatal ("error releasing old fractions:" , zap .Error (err ))
120129 }
121130 for _ , sealed := range toOffload {
122131 wg .Add (1 )
123- go func () {
132+ lc . tasks . Run ( sealed . instance . BaseFileName , ctx , func (ctx context. Context ) {
124133 defer wg .Done ()
125134
126- remote , _ := lc .TryOffload (ctx , sealed .instance )
135+ remote := lc .OffloadWithRetry (ctx , sealed .instance , retryDelay )
136+
127137 lc .registry .PromoteToRemote (sealed , remote )
128138
129139 if remote == nil {
@@ -136,7 +146,41 @@ func (lc *lifecycleManager) OffloadLocal(ctx context.Context, sizeLimit uint64,
136146 // Free up local resources
137147 sealed .instance .Suicide ()
138148 maintenanceTruncateTotal .Add (1 )
139- }()
149+ })
150+ }
151+ }
152+
153+ // OffloadWithRetry attempts to offload a fraction with retries until success or cancellation.
154+ // Returns the remote fraction instance and a boolean indicating whether offloading was not canceled.
155+ func (lc * lifecycleManager ) OffloadWithRetry (ctx context.Context , sealed * frac.Sealed , retryDelay time.Duration ) * frac.Remote {
156+ start := time .Now ()
157+ for i := 0 ; ; i ++ {
158+ remote , err := lc .TryOffload (ctx , sealed )
159+ if err == nil {
160+ return remote
161+ }
162+
163+ logger .Warn (
164+ "fail to offload fraction" ,
165+ zap .String ("name" , sealed .BaseFileName ),
166+ zap .Duration ("offloading_time" , time .Since (start )),
167+ zap .Int ("attempts" , i ),
168+ zap .Error (err ),
169+ )
170+
171+ select {
172+ case <- ctx .Done ():
173+ logger .Info (
174+ "fraction offloading was stopped" ,
175+ zap .String ("name" , sealed .BaseFileName ),
176+ zap .Duration ("offloading_time" , time .Since (start )),
177+ zap .Int ("attempts" , i ),
178+ zap .Error (ctx .Err ()),
179+ )
180+ return nil
181+ case <- time .After (retryDelay ):
182+ // Wait before next retry attempt
183+ }
140184 }
141185}
142186
@@ -163,9 +207,6 @@ func (lc *lifecycleManager) TryOffload(ctx context.Context, sealed *frac.Sealed)
163207
164208// CleanRemote deletes outdated remote fractions based on retention policy
165209func (lc * lifecycleManager ) CleanRemote (retention time.Duration , wg * sync.WaitGroup ) {
166- if retention == 0 {
167- return
168- }
169210 toDelete := lc .registry .EvictRemote (retention )
170211 wg .Add (1 )
171212 go func () {
@@ -202,6 +243,21 @@ func (lc *lifecycleManager) CleanLocal(sizeLimit uint64, wg *sync.WaitGroup) {
202243 }()
203244}
204245
246+ // RemoveOverflowed removes fractions from offloading queue that exceed size limit
247+ // Stops ongoing offloading tasks and cleans up both local and remote resources.
248+ func (lc * lifecycleManager ) RemoveOverflowed (sizeLimit uint64 , wg * sync.WaitGroup ) {
249+ evicted := lc .registry .EvictOverflowed (sizeLimit )
250+ for _ , item := range evicted {
251+ wg .Add (1 )
252+ go func () {
253+ defer wg .Done ()
254+ // Cancel the offloading task - this operation may take significant time
255+ // hence executed in a separate goroutine to avoid blocking
256+ lc .tasks .Cancel (item .instance .BaseFileName )
257+ }()
258+ }
259+ }
260+
205261// UpdateOldestMetric updates the prometheus metric with oldest fraction timestamp
206262func (lc * lifecycleManager ) UpdateOldestMetric () {
207263 oldestFracTime .WithLabelValues ("remote" ).Set ((time .Duration (lc .registry .OldestTotal ()) * time .Millisecond ).Seconds ())
0 commit comments