@@ -26,6 +26,7 @@ const (
2626 defaultMinGroupTaskSize = 2
2727 defaultOfflineNodeRetentionPeriod = time .Minute * 20
2828 defaultBannedNodeHoldOffPeriod = time .Minute * 3
29+ defaultLVCUnbanWindow = time .Hour * 2
2930 defaultDisabledSchedulerJobsWaitTimeout = time .Minute * 20
3031 defaultDisabledWriteSessionsWaitTimeout = time .Minute * 20
3132 defaultOfflineHTTPProxyRetentionPeriod = time .Minute * 10
@@ -99,6 +100,10 @@ type TaskProcessorConfig struct {
99100
100101 OfflineNodeRetentionPeriod time.Duration `yaml:"offline_node_retention_period"`
101102 BannedNodeHoldOffPeriod time.Duration `yaml:"banned_node_hold_off_period"`
103+ MaxBannedNodeHoldOffPeriod time.Duration `yaml:"max_banned_node_hold_off_period"`
104+ // LVCUnbanWindow is the duration after which a node will no longer be unbanned because of LVC.
105+ // It is assumed that all lost chunks will be found before LVCUnbanWindow expires.
106+ LVCUnbanWindow time.Duration `yaml:"lvc_unban_window"`
102107 // DisabledSchedulerJobsWaitTimeout is a max time to wait for node's scheduler jobs to finish
103108 // in fast decommission scenarios.
104109 DisabledSchedulerJobsWaitTimeout time.Duration `yaml:"disabled_scheduler_jobs_wait_timeout"`
@@ -194,6 +199,14 @@ func (c *TaskProcessorConfig) UnmarshalYAML(unmarshal func(any) error) error {
194199 c .BannedNodeHoldOffPeriod = defaultBannedNodeHoldOffPeriod
195200 }
196201
202+ if c .MaxBannedNodeHoldOffPeriod <= 0 {
203+ c .MaxBannedNodeHoldOffPeriod = 3 * c .BannedNodeHoldOffPeriod
204+ }
205+
206+ if c .LVCUnbanWindow <= 0 {
207+ c .LVCUnbanWindow = defaultLVCUnbanWindow
208+ }
209+
197210 if c .DisabledSchedulerJobsWaitTimeout <= 0 {
198211 c .DisabledSchedulerJobsWaitTimeout = defaultDisabledSchedulerJobsWaitTimeout
199212 }
@@ -313,12 +326,12 @@ type TaskProcessor struct {
313326 // hostAnnotations stores additional host information retrieved from Wall-e.
314327 hostAnnotations * HostAnnotations
315328
316- // lastNodeBanTime stores a time of the latest node ban made by CMS.
317- // This value is used to limit the number of parallel node bans.
318- lastNodeBanTime time.Time
319329 // lastBannedNodeGroup stores the group name of the last banned node
320330 // that belongs to a group task.
321331 lastBannedNodeGroup string
332+ // nodeBanLimiter stores the time of the latest node ban and of the unbans made by CMS.
333+ // It is used to limit the number of parallel node bans, increasing the ban interval if LVC is found.
334+ nodeBanLimiter * NodeBanLimiter
322335
323336 chunkIntegrity * ytsys.ChunkIntegrity
324337 missingChunksThrottler * MissingChunksThrottler
@@ -379,6 +392,7 @@ func (p *TaskProcessor) reset(tasks []*models.Task) {
379392 p .rpcProxyRoleLimits = NewProxyRoleLimits (p .Conf ().MaxRPCProxiesPerRole , & rpcProxyCache {c : p .cluster })
380393 p .initRateLimiter (tasks )
381394 p .initGPURateLimiter (tasks )
395+ p .nodeBanLimiter = NewNodeBanLimiter (p .conf .BannedNodeHoldOffPeriod , p .conf .MaxBannedNodeHoldOffPeriod )
382396 p .initLastNodeBanTime (tasks )
383397}
384398
@@ -429,24 +443,27 @@ func (p *TaskProcessor) initGPURateLimiter(tasks []*models.Task) {
429443
430444// initLastNodeBanTime initializes the last node ban time from the given tasks.
//
// It is a no-op when the ban limiter already holds a non-zero last ban time.
// Otherwise it scans every node of every task, picks the latest BanTime among
// nodes that are marked as banned, stores it in the ban limiter and logs the result.
// If no banned node is found, the stored value stays the zero time.
431445func (p * TaskProcessor ) initLastNodeBanTime (tasks []* models.Task ) {
432- if ! p .lastNodeBanTime .IsZero () {
446+ if ! p .nodeBanLimiter . LastBanTime .IsZero () {
433447 return
434448 }
435449

 // Accumulate the maximum in a local and publish it to the limiter once, after the scan.
450+ var lastBanTime time.Time
451+
436452 for _ , t := range tasks {
437453 for _ , n := range t .GetNodes () {
438454 banTime := time .Time (n .BanTime )
439- if n .Banned && banTime .After (p . lastNodeBanTime ) {
440- p . lastNodeBanTime = banTime
455+ if n .Banned && banTime .After (lastBanTime ) {
456+ lastBanTime = banTime
 // NOTE(review): the group name is only overwritten when the newest ban so far
 // belongs to a group task; a newer ban from a non-group task leaves the
 // previously recorded group in place — confirm this is intended.
441457 if t .IsGroupTask {
442458 p .lastBannedNodeGroup = t .TaskGroup
443459 }
444460 }
445461 }
446462 }
447463

464+ p .nodeBanLimiter .LastBanTime = lastBanTime
448465 p .l .Info ("setting last node ban time" ,
449- log .Time ("last_node_ban_time" , p . lastNodeBanTime ),
466+ log .Time ("last_node_ban_time" , lastBanTime ),
450467 log .String ("last_banned_node_group" , p .lastBannedNodeGroup ))
451468}
452469
@@ -656,10 +673,10 @@ func (p *TaskProcessor) checkChunkIntegrity(ctx context.Context) error {
656673
657674 if i .LVC > 0 || i .QMC > 0 {
658675 p .l .Info ("LVC > 0 or QMC > 0 -> unbanning nodes" , log .Int64 ("lvc" , i .LVC ), log .Int64 ("qmc" , i .QMC ))
659- if err := p .unbanNodes (ctx ); err != nil {
676+ if err := p .unbanNodesBecauseOfLVC (ctx ); err != nil {
660677 p .l .Error ("error unbanning nodes" , log .Error (err ))
661678 } else {
662- p .lastNodeBanTime = time.Time {}
679+ p .nodeBanLimiter . LastBanTime = time.Time {}
663680 }
664681 } else {
665682 p .l .Info ("LVC is 0" )
0 commit comments