Skip to content

Commit afa5512

Browse files
Fix initial auto-repairs skipped by too soon check
patch by Paulo Motta; reviewed by Jaydeepkumar Chovatia for CASSANDRA-21115
1 parent 099f376 commit afa5512

File tree

8 files changed

+186
-41
lines changed

8 files changed

+186
-41
lines changed

CHANGES.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
5.1
2+
* Fix initial auto-repairs skipped by too soon check (CASSANDRA-21115)
23
* Add configuration to disk usage guardrails to stop writes across all replicas of a keyspace when any node replicating that keyspace exceeds the disk usage failure threshold. (CASSANDRA-21024)
34
* BETWEEN where token(Y) > token(Z) returns wrong answer (CASSANDRA-20154)
45
* Optimize memtable flush logic (CASSANDRA-21083)

src/java/org/apache/cassandra/repair/autorepair/AutoRepair.java

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,8 @@ public void repair(AutoRepairConfig.RepairType repairType)
209209
long startTimeInMillis = timeFunc.get();
210210
logger.info("My host id: {}, my turn to run repair...repair primary-ranges only? {}", myId,
211211
config.getRepairPrimaryTokenRangeOnly(repairType));
212-
AutoRepairUtils.updateStartAutoRepairHistory(repairType, myId, timeFunc.get(), turn);
212+
AutoRepairUtils.updateStartAutoRepairHistory(repairType, myId, startTimeInMillis, turn);
213+
repairState.setLastRepairStartTime(startTimeInMillis);
213214

214215
repairState.setRepairKeyspaceCount(0);
215216
repairState.setRepairInProgress(true);
@@ -402,20 +403,31 @@ else if (retryCount < config.getRepairMaxRetries(repairType))
402403
}
403404
}
404405

405-
private boolean tooSoonToRunRepair(AutoRepairConfig.RepairType repairType, AutoRepairState repairState, AutoRepairConfig config, UUID myId)
406+
@VisibleForTesting
407+
boolean tooSoonToRunRepair(AutoRepairConfig.RepairType repairType, AutoRepairState repairState, AutoRepairConfig config, UUID myId)
406408
{
407-
if (repairState.getLastRepairTime() == 0)
409+
if (repairState.getLastRepairFinishTime() == 0)
408410
{
409-
// the node has either just boooted or has not run repair before,
411+
// the node has either just booted or has not run repair before,
410412
// we should check for the node's repair history in the DB
411-
repairState.setLastRepairTime(AutoRepairUtils.getLastRepairTimeForNode(repairType, myId));
413+
repairState.setLastRepairFinishTime(AutoRepairUtils.getLastRepairFinishTimeForNode(repairType, myId));
414+
repairState.setLastRepairStartTime(AutoRepairUtils.getLastRepairStartTimeForNode(repairType, myId));
412415
}
416+
417+
// If repair has not completed (start >= finish), don't skip - allow it to continue/resume
418+
if (repairState.getLastRepairStartTime() >= repairState.getLastRepairFinishTime())
419+
{
420+
logger.info("Incomplete or unstarted repair detected (start_ts={} >= finish_ts={}), allowing resume",
421+
repairState.getLastRepairStartTime(), repairState.getLastRepairFinishTime());
422+
return false;
423+
}
424+
413425
/*
414426
* check if it is too soon to run repair. one of the reason we
415427
* should not run frequent repair is that repair triggers
416428
* memtable flush
417429
*/
418-
long timeElapsedSinceLastRepair = TimeUnit.MILLISECONDS.toSeconds(timeFunc.get() - repairState.getLastRepairTime());
430+
long timeElapsedSinceLastRepair = TimeUnit.MILLISECONDS.toSeconds(timeFunc.get() - repairState.getLastRepairFinishTime());
419431
if (timeElapsedSinceLastRepair < config.getRepairMinInterval(repairType).toSeconds())
420432
{
421433
logger.info("Too soon to run repair, last repair was done {} seconds ago",
@@ -497,14 +509,14 @@ private void cleanupAndUpdateStats(RepairTurn turn, AutoRepairConfig.RepairType
497509
"repairTokenRangesSkipCount {}, repairTablesSkipCount {}", repairType, timeInHours, repairState.getRepairKeyspaceCount(),
498510
repairState.getSucceededTokenRangesCount(), repairState.getFailedTokenRangesCount(),
499511
repairState.getSkippedTokenRangesCount(), repairState.getSkippedTablesCount());
500-
if (repairState.getLastRepairTime() != 0)
512+
if (repairState.getLastRepairFinishTime() != 0)
501513
{
502514
repairState.setClusterRepairTimeInSec((int) TimeUnit.MILLISECONDS.toSeconds(timeFunc.get() -
503-
repairState.getLastRepairTime()));
515+
repairState.getLastRepairFinishTime()));
504516
logger.info("Cluster repair time for repair type {}: {} day(s)", repairType,
505517
TimeUnit.SECONDS.toDays(repairState.getClusterRepairTimeInSec()));
506518
}
507-
repairState.setLastRepairTime(timeFunc.get());
519+
repairState.setLastRepairFinishTime(timeFunc.get());
508520
repairState.setRepairInProgress(false);
509521

510522
AutoRepairUtils.updateFinishAutoRepairHistory(repairType, myId, timeFunc.get());

src/java/org/apache/cassandra/repair/autorepair/AutoRepairState.java

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,9 @@ public abstract class AutoRepairState
6464
@VisibleForTesting
6565
protected int totalTablesConsideredForRepair = 0;
6666
@VisibleForTesting
67-
protected long lastRepairTimeInMs;
67+
protected long lastRepairFinishTimeInMs;
68+
@VisibleForTesting
69+
protected long lastRepairStartTimeInMs;
6870
@VisibleForTesting
6971
protected int nodeRepairTimeInSec = 0;
7072
@VisibleForTesting
@@ -120,9 +122,9 @@ public void updateRepairScheduleStatistics(List<PrioritizedRepairPlan> repairPla
120122
setTotalKeyspaceRepairPlansToRepair(repairPlans.stream().mapToInt(repairPlan -> repairPlan.getKeyspaceRepairPlans().size()).sum());
121123
}
122124

123-
public long getLastRepairTime()
125+
public long getLastRepairFinishTime()
124126
{
125-
return lastRepairTimeInMs;
127+
return lastRepairFinishTimeInMs;
126128
}
127129

128130
public void setTotalTablesConsideredForRepair(int count)
@@ -135,9 +137,19 @@ public int getTotalTablesConsideredForRepair()
135137
return totalTablesConsideredForRepair;
136138
}
137139

138-
public void setLastRepairTime(long lastRepairTime)
140+
public void setLastRepairFinishTime(long lastRepairFinishTime)
141+
{
142+
lastRepairFinishTimeInMs = lastRepairFinishTime;
143+
}
144+
145+
public long getLastRepairStartTime()
146+
{
147+
return lastRepairStartTimeInMs;
148+
}
149+
150+
public void setLastRepairStartTime(long lastRepairStartTime)
139151
{
140-
lastRepairTimeInMs = lastRepairTime;
152+
lastRepairStartTimeInMs = lastRepairStartTime;
141153
}
142154

143155
public int getClusterRepairTimeInSec()

src/java/org/apache/cassandra/repair/autorepair/AutoRepairUtils.java

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -177,11 +177,16 @@ public class AutoRepairUtils
177177
"SELECT %s FROM %s.%s WHERE %s = ? AND %s = ?", COL_REPAIR_FINISH_TS, SchemaConstants.DISTRIBUTED_KEYSPACE_NAME,
178178
SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, COL_REPAIR_TYPE, COL_HOST_ID);
179179

180+
final static String SELECT_LAST_REPAIR_START_TIME_FOR_NODE = String.format(
181+
"SELECT %s FROM %s.%s WHERE %s = ? AND %s = ?", COL_REPAIR_START_TS, SchemaConstants.DISTRIBUTED_KEYSPACE_NAME,
182+
SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, COL_REPAIR_TYPE, COL_HOST_ID);
183+
180184
static ModificationStatement delStatementRepairHistory;
181185
static SelectStatement selectStatementRepairHistory;
182186
static ModificationStatement delStatementPriorityStatus;
183187
static SelectStatement selectStatementRepairPriority;
184188
static SelectStatement selectLastRepairTimeForNode;
189+
static SelectStatement selectLastRepairStartTimeForNode;
185190
static ModificationStatement addPriorityHost;
186191
static ModificationStatement insertNewRepairHistoryStatement;
187192
static ModificationStatement recordStartRepairHistoryStatement;
@@ -207,6 +212,8 @@ public static void setup()
207212
.forInternalCalls());
208213
selectLastRepairTimeForNode = (SelectStatement) QueryProcessor.getStatement(SELECT_LAST_REPAIR_TIME_FOR_NODE, ClientState
209214
.forInternalCalls());
215+
selectLastRepairStartTimeForNode = (SelectStatement) QueryProcessor.getStatement(SELECT_LAST_REPAIR_START_TIME_FOR_NODE, ClientState
216+
.forInternalCalls());
210217
delStatementPriorityStatus = (ModificationStatement) QueryProcessor.getStatement(DEL_REPAIR_PRIORITY, ClientState
211218
.forInternalCalls());
212219
addPriorityHost = (ModificationStatement) QueryProcessor.getStatement(ADD_PRIORITY_HOST, ClientState
@@ -382,7 +389,8 @@ public static void setForceRepairNewNode(RepairType repairType)
382389
// this function will be called when a node bootstrap finished
383390
UUID hostId = StorageService.instance.getHostIdForEndpoint(FBUtilities.getBroadcastAddressAndPort());
384391
// insert the data first
385-
insertNewRepairHistory(repairType, currentTimeMillis(), currentTimeMillis());
392+
long timestamp = currentTimeMillis();
393+
insertNewRepairHistory(repairType, timestamp, timestamp);
386394
setForceRepair(repairType, hostId);
387395
}
388396

@@ -407,7 +415,7 @@ public static void setForceRepair(RepairType repairType, UUID hostId)
407415
logger.info("Set force repair repair type: {}, node: {}", repairType, hostId);
408416
}
409417

410-
public static long getLastRepairTimeForNode(RepairType repairType, UUID hostId)
418+
public static long getLastRepairFinishTimeForNode(RepairType repairType, UUID hostId)
411419
{
412420
ResultMessage.Rows rows = selectLastRepairTimeForNode.execute(QueryState.forInternalCalls(),
413421
QueryOptions.forInternalCalls(internalQueryCL,
@@ -423,6 +431,22 @@ public static long getLastRepairTimeForNode(RepairType repairType, UUID hostId)
423431
return repairTime.one().getLong(COL_REPAIR_FINISH_TS);
424432
}
425433

434+
public static long getLastRepairStartTimeForNode(RepairType repairType, UUID hostId)
435+
{
436+
ResultMessage.Rows rows = selectLastRepairStartTimeForNode.execute(QueryState.forInternalCalls(),
437+
QueryOptions.forInternalCalls(internalQueryCL,
438+
Lists.newArrayList(
439+
ByteBufferUtil.bytes(repairType.toString()),
440+
ByteBufferUtil.bytes(hostId))),
441+
Dispatcher.RequestTime.forImmediateExecution());
442+
UntypedResultSet repairTime = UntypedResultSet.create(rows.result);
443+
if (repairTime.isEmpty())
444+
{
445+
return 0;
446+
}
447+
return repairTime.one().getLong(COL_REPAIR_START_TS);
448+
}
449+
426450
@VisibleForTesting
427451
public static CurrentRepairStatus getCurrentRepairStatus(RepairType repairType, List<AutoRepairHistory> autoRepairHistories, UUID myId)
428452
{
@@ -840,7 +864,8 @@ else if (!hostIdsInCurrentRing.contains(nodeHistory.hostId))
840864
if (!autoRepairHistoryIds.contains(hostId))
841865
{
842866
logger.info("{} for repair type {} doesn't exist in the auto repair history table, insert a new record.", repairType, hostId);
843-
insertNewRepairHistory(repairType, hostId, currentTimeMillis(), currentTimeMillis());
867+
long timestamp = currentTimeMillis();
868+
insertNewRepairHistory(repairType, hostId, timestamp, timestamp);
844869
}
845870
}
846871

0 commit comments

Comments
 (0)