
Commit 7fe688b

isaacreath authored and pauloricardomg committed
CASSANDRA-21024: Add configuration to disk usage guardrails to stop writes across all replicas of a keyspace when any node replicating that keyspace exceeds the disk usage failure threshold.
This commit adds a new configuration, data_disk_usage_keyspace_wide_protection_enabled, which ensures that if any node that replicates a given keyspace is full, all writes to that keyspace are blocked.

patch by Isaac Reath; reviewed by Stefan Miklosovic and Paulo Motta for CASSANDRA-21024

Closes #4547
1 parent bad0a10 commit 7fe688b
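
For operators, the guardrail can be enabled statically through the new cassandra.yaml key shown in the diffs below, or flipped at runtime through the Guardrails singleton this commit extends. A minimal in-process sketch, assuming it runs inside a Cassandra node (or a test harness) where Guardrails.instance has been initialized:

    import org.apache.cassandra.db.guardrails.Guardrails;

    final class EnableKeyspaceWideProtectionSketch
    {
        static void enable()
        {
            // Uses the getter/setter pair added in this commit; the flag defaults to false.
            Guardrails.instance.setDataDiskUsageKeyspaceWideProtectionEnabled(true);
            assert Guardrails.instance.getDataDiskUsageKeyspaceWideProtectionEnabled();
        }
    }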

File tree: 12 files changed (+579 / -23 lines)


CHANGES.txt

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 5.1
+ * Add configuration to disk usage guardrails to stop writes across all replicas of a keyspace when any node replicating that keyspace exceeds the disk usage failure threshold. (CASSANDRA-21024)
  * BETWEEN where token(Y) > token(Z) returns wrong answer (CASSANDRA-20154)
  * Optimize memtable flush logic (CASSANDRA-21083)
  * No need to evict already prepared statements, as it creates a race condition between multiple threads (CASSANDRA-17401)

conf/cassandra.yaml

Lines changed: 4 additions & 0 deletions
@@ -2517,6 +2517,10 @@ drop_compact_storage_enabled: false
 # Min unit: B
 # data_disk_usage_max_disk_size:
 #
+# Configures the disk usage guardrails to block all writes to a keyspace if any node which replicates that keyspace
+# is full. By default, this is disabled.
+# data_disk_usage_keyspace_wide_protection_enabled: false
+#
 # Guardrail to warn or fail when the minimum replication factor is lesser than threshold.
 # This would also apply to system keyspaces.
 # Suggested value for use in production: 2 or higher

conf/cassandra_latest.yaml

Lines changed: 4 additions & 0 deletions
@@ -2295,6 +2295,10 @@ drop_compact_storage_enabled: false
 # Min unit: B
 # data_disk_usage_max_disk_size:
 #
+# Configures the disk usage guardrails to block all writes to a keyspace if any node which replicates that keyspace
+# is full. By default, this is disabled.
+# data_disk_usage_keyspace_wide_protection_enabled: false
+#
 # Guardrail to warn or fail when the minimum replication factor is lesser than threshold.
 # This would also apply to system keyspaces.
 # Suggested value for use in production: 2 or higher

src/java/org/apache/cassandra/config/Config.java

Lines changed: 1 addition & 0 deletions
@@ -985,6 +985,7 @@ public static void setClientMode(boolean clientMode)
     public volatile int data_disk_usage_percentage_warn_threshold = -1;
     public volatile int data_disk_usage_percentage_fail_threshold = -1;
     public volatile DataStorageSpec.LongBytesBound data_disk_usage_max_disk_size = null;
+    public volatile boolean data_disk_usage_keyspace_wide_protection_enabled = false;
     public volatile int minimum_replication_factor_warn_threshold = -1;
     public volatile int minimum_replication_factor_fail_threshold = -1;
     public volatile int maximum_replication_factor_warn_threshold = -1;

src/java/org/apache/cassandra/config/GuardrailsOptions.java

Lines changed: 14 additions & 0 deletions
@@ -989,6 +989,20 @@ public void setDataDiskUsagePercentageThreshold(int warn, int fail)
                                   x -> config.data_disk_usage_percentage_fail_threshold = x);
     }
 
+
+    public boolean getDataDiskUsageKeyspaceWideProtectionEnabled()
+    {
+        return config.data_disk_usage_keyspace_wide_protection_enabled;
+    }
+
+    public void setDataDiskUsageKeyspaceWideProtectionEnabled(boolean enabled)
+    {
+        updatePropertyWithLogging("data_disk_usage_keyspace_wide_protection_enabled",
+                                  enabled,
+                                  () -> config.data_disk_usage_keyspace_wide_protection_enabled,
+                                  x -> config.data_disk_usage_keyspace_wide_protection_enabled = x);
+    }
+
     @Override
     public DataStorageSpec.LongBytesBound getDataDiskUsageMaxDiskSize()
     {
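
The setter above goes through GuardrailsOptions' updatePropertyWithLogging helper, which records the old and new values when a guardrail setting is changed at runtime. A standalone sketch of that pattern (illustrative only; the real helper is private to GuardrailsOptions and may differ in details):

    import java.util.function.Consumer;
    import java.util.function.Supplier;

    final class UpdateWithLoggingSketch
    {
        // Illustrative stand-in: read the old value, apply the new one, and log the
        // transition so runtime guardrail changes are auditable.
        static <T> void updatePropertyWithLogging(String name, T newValue, Supplier<T> getter, Consumer<T> setter)
        {
            T oldValue = getter.get();
            setter.accept(newValue);
            System.out.printf("Updated %s from %s to %s%n", name, oldValue, newValue);
        }

        public static void main(String[] args)
        {
            final boolean[] store = { false };
            updatePropertyWithLogging("data_disk_usage_keyspace_wide_protection_enabled",
                                      true,
                                      () -> store[0],
                                      x -> store[0] = x);
        }
    }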

src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java

Lines changed: 27 additions & 4 deletions
@@ -103,6 +103,7 @@
 import org.apache.cassandra.exceptions.RequestExecutionException;
 import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.exceptions.UnauthorizedException;
+import org.apache.cassandra.locator.NetworkTopologyStrategy;
 import org.apache.cassandra.locator.Replica;
 import org.apache.cassandra.locator.ReplicaLayout;
 import org.apache.cassandra.metrics.ClientRequestSizeMetrics;
@@ -124,6 +125,7 @@
 import org.apache.cassandra.service.paxos.Ballot;
 import org.apache.cassandra.service.paxos.BallotGenerator;
 import org.apache.cassandra.service.paxos.Commit.Proposal;
+import org.apache.cassandra.tcm.ClusterMetadata;
 import org.apache.cassandra.transport.Dispatcher;
 import org.apache.cassandra.transport.messages.ResultMessage;
 import org.apache.cassandra.triggers.TriggerExecutor;
@@ -412,15 +414,36 @@ public void validate(ClientState state) throws InvalidRequestException
 
     public void validateDiskUsage(QueryOptions options, ClientState state)
     {
-        // reject writes if any replica exceeds disk usage failure limit or warn if it exceeds warn limit
-        if (Guardrails.replicaDiskUsage.enabled(state) && DiskUsageBroadcaster.instance.hasStuffedOrFullNode())
+
+        if (Guardrails.diskUsageKeyspaceWideProtection.enabled(state) &&
+            Guardrails.instance.getDataDiskUsageKeyspaceWideProtectionEnabled() &&
+            DiskUsageBroadcaster.instance.hasStuffedOrFullNode())
+        {
+            Keyspace keyspace = Keyspace.open(keyspace());
+            // If the keyspace is using NetworkTopologyStrategy then we can check each datacenter on which
+            // the keyspace is replicated.
+            if (keyspace.getMetadata().replicationStrategy instanceof NetworkTopologyStrategy)
+            {
+                for (String datacenter : ((NetworkTopologyStrategy) keyspace.getMetadata().replicationStrategy).getDatacenters())
+                {
+                    Guardrails.diskUsageKeyspaceWideProtection.guard(datacenter, state);
+                }
+            }
+            // Otherwise, if we are using SimpleStrategy then we have to check if any datacenter contains a full node.
+            else
+            {
+                for (String datacenter : ClusterMetadata.current().directory.knownDatacenters())
+                {
+                    Guardrails.diskUsageKeyspaceWideProtection.guard(datacenter, state);
+                }
+            }
+        }
+        else if (Guardrails.replicaDiskUsage.enabled(state) && DiskUsageBroadcaster.instance.hasStuffedOrFullNode())
         {
             Keyspace keyspace = Keyspace.open(keyspace());
-
             for (ByteBuffer key : buildPartitionKeyNames(options, state))
             {
                 Token token = metadata().partitioner.getToken(key);
-
                 for (Replica replica : ReplicaLayout.forTokenWriteLiveAndDown(keyspace, token).all())
                 {
                     Guardrails.replicaDiskUsage.guard(replica.endpoint(), state);
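
The new branch above chooses which datacenters to run the guardrail against: with NetworkTopologyStrategy only the datacenters the keyspace actually replicates to are checked, while for any other strategy (such as SimpleStrategy, whose placement ignores datacenters) every known datacenter is checked. A standalone sketch of that decision, using plain stand-in types rather than Cassandra's classes:

    import java.util.Set;

    final class DatacenterSelectionSketch
    {
        // Stand-in for the branch in validateDiskUsage(): NTS lets us scope the check to the
        // keyspace's own datacenters; other strategies fall back to every known datacenter.
        static Set<String> datacentersToGuard(boolean usesNetworkTopologyStrategy,
                                              Set<String> keyspaceDatacenters,
                                              Set<String> allKnownDatacenters)
        {
            return usesNetworkTopologyStrategy ? keyspaceDatacenters : allKnownDatacenters;
        }

        public static void main(String[] args)
        {
            Set<String> all = Set.of("dc1", "dc2");
            // NTS keyspace replicated only to dc1: a full node in dc2 does not block its writes.
            System.out.println(datacentersToGuard(true, Set.of("dc1"), all));
            // SimpleStrategy keyspace: a full node in any datacenter blocks writes.
            System.out.println(datacentersToGuard(false, Set.of(), all));
        }
    }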

src/java/org/apache/cassandra/db/guardrails/Guardrails.java

Lines changed: 25 additions & 0 deletions
@@ -556,6 +556,19 @@ public final class Guardrails implements GuardrailsMBean
                     (isWarning, value) ->
                     isWarning ? "Replica disk usage exceeds warning threshold"
                               : "Write request failed because disk usage exceeds failure threshold");
+    /**
+     * Guardrail on the data disk usage of replicas across a datacenter which replicates a given keyspace.
+     * This is used at write time to verify the status of any node which might replicate a given keyspace.
+     */
+    public static final Predicates<String> diskUsageKeyspaceWideProtection =
+    new Predicates<>("disk_usage_keyspace_wide_protection",
+                     null,
+                     state -> DiskUsageBroadcaster.instance::isDatacenterStuffed,
+                     state -> DiskUsageBroadcaster.instance::isDatacenterFull,
+                     (isWarning, value) ->
+                     isWarning ? "Disk usage in keyspace datacenter exceeds warning threshold"
+                               : "Write request failed because disk usage exceeds failure threshold in keyspace datacenter.");
+
     /**
      * Guardrail on passwords for CREATE / ALTER ROLE statements.
      */
@@ -1597,6 +1610,18 @@ public void setDataDiskUsageMaxDiskSize(@Nullable String size)
         DEFAULT_CONFIG.setDataDiskUsageMaxDiskSize(sizeFromString(size));
     }
 
+    @Override
+    public boolean getDataDiskUsageKeyspaceWideProtectionEnabled()
+    {
+        return DEFAULT_CONFIG.getDataDiskUsageKeyspaceWideProtectionEnabled();
+    }
+
+    @Override
+    public void setDataDiskUsageKeyspaceWideProtectionEnabled(boolean enabled)
+    {
+        DEFAULT_CONFIG.setDataDiskUsageKeyspaceWideProtectionEnabled(enabled);
+    }
+
     @Override
     public int getMinimumReplicationFactorWarnThreshold()
     {

src/java/org/apache/cassandra/db/guardrails/GuardrailsMBean.java

Lines changed: 11 additions & 0 deletions
@@ -874,6 +874,17 @@ public interface GuardrailsMBean
     @Nullable
     String getDataDiskUsageMaxDiskSize();
 
+    /**
+     * @return whether a single full node replicating a given keyspace should block writes for the entire keyspace;
+     *         {@code true} if this protection is enabled, {@code false} otherwise.
+     */
+    boolean getDataDiskUsageKeyspaceWideProtectionEnabled();
+
+    /**
+     * @param enabled Enables or disables blocking writes for a keyspace if a node replicating that keyspace is full.
+     */
+    void setDataDiskUsageKeyspaceWideProtectionEnabled(boolean enabled);
+
     /**
      * @param size The max disk size of the data directories when calculating disk usage thresholds, as a string
      *             formatted as in, for example, {@code 10GiB}, {@code 20MiB}, {@code 30KiB} or {@code 40B}.
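
Because these methods live on GuardrailsMBean, the flag can also be toggled on a running node over JMX. A minimal client sketch; the local JMX port 7199 and the Guardrails MBean name org.apache.cassandra.db:type=Guardrails are assumptions based on common Cassandra defaults, not taken from this diff:

    import javax.management.JMX;
    import javax.management.MBeanServerConnection;
    import javax.management.ObjectName;
    import javax.management.remote.JMXConnector;
    import javax.management.remote.JMXConnectorFactory;
    import javax.management.remote.JMXServiceURL;

    import org.apache.cassandra.db.guardrails.GuardrailsMBean;

    public class ToggleKeyspaceWideProtection
    {
        public static void main(String[] args) throws Exception
        {
            // Assumed defaults: local JMX endpoint and the conventional Guardrails MBean name.
            JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://127.0.0.1:7199/jmxrmi");
            try (JMXConnector connector = JMXConnectorFactory.connect(url))
            {
                MBeanServerConnection connection = connector.getMBeanServerConnection();
                ObjectName name = new ObjectName("org.apache.cassandra.db:type=Guardrails");
                GuardrailsMBean guardrails = JMX.newMBeanProxy(connection, name, GuardrailsMBean.class);

                guardrails.setDataDiskUsageKeyspaceWideProtectionEnabled(true);
                System.out.println("enabled = " + guardrails.getDataDiskUsageKeyspaceWideProtectionEnabled());
            }
        }
    }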

src/java/org/apache/cassandra/service/disk/usage/DiskUsageBroadcaster.java

Lines changed: 139 additions & 1 deletion
@@ -18,6 +18,7 @@
 
 package org.apache.cassandra.service.disk.usage;
 
+import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
 import java.util.concurrent.TimeUnit;
@@ -27,13 +28,16 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.gms.ApplicationState;
 import org.apache.cassandra.gms.EndpointState;
 import org.apache.cassandra.gms.Gossiper;
 import org.apache.cassandra.gms.IEndpointStateChangeSubscriber;
 import org.apache.cassandra.gms.VersionedValue;
 import org.apache.cassandra.locator.InetAddressAndPort;
+import org.apache.cassandra.locator.Locator;
 import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.tcm.membership.Location;
 import org.apache.cassandra.utils.NoSpamLogger;
 
 /**
@@ -50,6 +54,8 @@ public class DiskUsageBroadcaster implements IEndpointStateChangeSubscriber
     private final DiskUsageMonitor monitor;
     private final ConcurrentMap<InetAddressAndPort, DiskUsageState> usageInfo = new ConcurrentHashMap<>();
     private volatile boolean hasStuffedOrFullNode = false;
+    private final ConcurrentMap<String, Set<InetAddressAndPort>> fullNodesByDatacenter = new ConcurrentHashMap<>();
+    private final ConcurrentMap<String, Set<InetAddressAndPort>> stuffedNodesByDatacenter = new ConcurrentHashMap<>();
 
     @VisibleForTesting
     public DiskUsageBroadcaster(DiskUsageMonitor monitor)
@@ -83,6 +89,34 @@ public boolean isStuffed(InetAddressAndPort endpoint)
         return state(endpoint).isStuffed();
     }
 
+    /**
+     * @return {@code true} if any node in datacenter {@code datacenter} has FULL disk usage.
+     */
+    @VisibleForTesting
+    public boolean isDatacenterFull(String datacenter)
+    {
+        if (!hasStuffedOrFullNode())
+        {
+            return false;
+        }
+        Set<InetAddressAndPort> fullNodes = fullNodesByDatacenter.get(datacenter);
+        return fullNodes != null && !fullNodes.isEmpty();
+    }
+
+    /**
+     * @return {@code true} if any node in datacenter {@code datacenter} has STUFFED disk usage.
+     */
+    @VisibleForTesting
+    public boolean isDatacenterStuffed(String datacenter)
+    {
+        if (!hasStuffedOrFullNode())
+        {
+            return false;
+        }
+        Set<InetAddressAndPort> stuffedNodes = stuffedNodesByDatacenter.get(datacenter);
+        return stuffedNodes != null && !stuffedNodes.isEmpty();
+    }
+
     @VisibleForTesting
     public DiskUsageState state(InetAddressAndPort endpoint)
     {
@@ -114,8 +148,9 @@ public void onChange(InetAddressAndPort endpoint, ApplicationState state, Versio
             noSpamLogger.warn(String.format("Found unknown DiskUsageState: %s. Using default state %s instead.",
                                             value.value, usageState));
         }
-        usageInfo.put(endpoint, usageState);
 
+        computeUsageStateForEpDatacenter(endpoint, usageState);
+        usageInfo.put(endpoint, usageState);
         hasStuffedOrFullNode = usageState.isStuffedOrFull() || computeHasStuffedOrFullNode();
     }
 
@@ -131,6 +166,85 @@ private boolean computeHasStuffedOrFullNode()
         return false;
     }
 
+    /**
+     * Update the per-datacenter sets of FULL and STUFFED nodes based on the disk usage state for the given endpoint.
+     * A FULL node is added to the full set (and removed from the stuffed set), a STUFFED node to the stuffed set
+     * (and removed from the full set); otherwise the endpoint is removed from both sets.
+     * This method is idempotent - adding an already-present node or removing an absent node has no effect.
+     *
+     * @param endpoint The endpoint whose state has changed.
+     * @param usageState The new disk usage state value.
+     */
+    private void computeUsageStateForEpDatacenter(InetAddressAndPort endpoint, DiskUsageState usageState)
+    {
+        Location location = location(endpoint);
+        if (location.equals(Location.UNKNOWN))
+        {
+            noSpamLogger.warn("Unable to track disk usage by datacenter for endpoint {} because we are unable to determine its location.",
+                              endpoint);
+            return;
+        }
+
+        String datacenter = location.datacenter;
+        if (usageState.isFull())
+        {
+            // Add this node to the set of full nodes for its datacenter and remove it from the stuffed nodes
+            // if it was there.
+            fullNodesByDatacenter.computeIfAbsent(datacenter, dc -> ConcurrentHashMap.newKeySet())
+                                 .add(endpoint);
+            noSpamLogger.debug("Endpoint {} is FULL, added to full nodes set for datacenter {}", endpoint, datacenter);
+            Set<InetAddressAndPort> stuffedNodes = stuffedNodesByDatacenter.get(datacenter);
+            if (stuffedNodes != null && stuffedNodes.remove(endpoint))
+            {
+                noSpamLogger.debug("Endpoint {} is now FULL. Removed it from the stuffed nodes set for datacenter {}",
+                                   endpoint, datacenter);
+            }
+        }
+        else if (usageState.isStuffed())
+        {
+            // Add this node to the set of stuffed nodes for its datacenter and remove it from the full nodes
+            // if it was there.
+            stuffedNodesByDatacenter.computeIfAbsent(datacenter, dc -> ConcurrentHashMap.newKeySet())
+                                    .add(endpoint);
+            noSpamLogger.debug("Endpoint {} is now STUFFED. Added it to the stuffed nodes set for datacenter {}",
+                               endpoint, datacenter);
+            Set<InetAddressAndPort> fullNodes = fullNodesByDatacenter.get(datacenter);
+            if (fullNodes != null && fullNodes.remove(endpoint))
+            {
+                noSpamLogger.debug("Endpoint {} is now STUFFED. Removed it from full nodes set for datacenter {}",
+                                   endpoint, datacenter);
+            }
+        }
+        else
+        {
+            // Remove this node from the set of full nodes and set of stuffed nodes for its datacenter if it was there.
+            Set<InetAddressAndPort> fullNodes = fullNodesByDatacenter.get(datacenter);
+            if (fullNodes != null && fullNodes.remove(endpoint))
+            {
+                noSpamLogger.debug("Endpoint {} is no longer STUFFED or FULL, removed from the full nodes set for datacenter {}",
+                                   endpoint, datacenter);
+            }
+            Set<InetAddressAndPort> stuffedNodes = stuffedNodesByDatacenter.get(datacenter);
+            if (stuffedNodes != null && stuffedNodes.remove(endpoint))
+            {
+                noSpamLogger.debug("Endpoint {} is no longer STUFFED, removed from the stuffed nodes set for datacenter {}",
+                                   endpoint, datacenter);
+            }
+        }
+    }
+
+    private Location location(InetAddressAndPort endpoint)
+    {
+        Locator locator = DatabaseDescriptor.getLocator();
+        if (locator == null)
+        {
+            noSpamLogger.warn("Unable to track disk usage by datacenter for endpoint {} because locator is null",
+                              endpoint);
+            return Location.UNKNOWN;
+        }
+        Location location = locator.location(endpoint);
+        return location != null ? location : Location.UNKNOWN;
+    }
+
     @Override
     public void onJoin(InetAddressAndPort endpoint, EndpointState epState)
     {
@@ -164,10 +278,34 @@ public void onRestart(InetAddressAndPort endpoint, EndpointState state)
     @Override
     public void onRemove(InetAddressAndPort endpoint)
     {
+        updateDiskUsageStateForDatacenterOnRemoval(endpoint);
         usageInfo.remove(endpoint);
         hasStuffedOrFullNode = usageInfo.values().stream().anyMatch(DiskUsageState::isStuffedOrFull);
     }
 
+    private void updateDiskUsageStateForDatacenterOnRemoval(InetAddressAndPort endpoint)
+    {
+        Location nodeLocation = location(endpoint);
+        if (nodeLocation.equals(Location.UNKNOWN))
+        {
+            logger.debug("Unable to determine location for removed endpoint {}. Will not update datacenter tracking.", endpoint);
+            return;
+        }
+
+        String datacenter = nodeLocation.datacenter;
+        // Remove the endpoint from the full nodes and stuffed nodes set for its datacenter
+        Set<InetAddressAndPort> fullNodes = fullNodesByDatacenter.get(datacenter);
+        if (fullNodes != null && fullNodes.remove(endpoint))
+        {
+            logger.debug("Removed endpoint {} from full nodes set for datacenter {} on node removal", endpoint, datacenter);
+        }
+        Set<InetAddressAndPort> stuffedNodes = stuffedNodesByDatacenter.get(datacenter);
+        if (stuffedNodes != null && stuffedNodes.remove(endpoint))
+        {
+            logger.debug("Removed endpoint {} from stuffed nodes set for datacenter {} on node removal", endpoint, datacenter);
+        }
+    }
+
     private void updateDiskUsage(InetAddressAndPort endpoint, EndpointState state)
     {
         VersionedValue localValue = state.getApplicationState(ApplicationState.DISK_USAGE);
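
The bookkeeping above follows a common concurrent idiom: one lazily created concurrent set of endpoints per datacenter, so gossip callbacks can update membership without explicit locks and repeated updates stay idempotent. A self-contained sketch of that idiom, using plain strings instead of Cassandra's endpoint type:

    import java.util.Set;
    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.ConcurrentMap;

    final class PerDatacenterTrackerSketch
    {
        private final ConcurrentMap<String, Set<String>> fullNodesByDatacenter = new ConcurrentHashMap<>();

        void markFull(String datacenter, String endpoint)
        {
            // newKeySet() returns a concurrent Set backed by a ConcurrentHashMap, so add/remove
            // from concurrent callbacks need no extra locking; add() is idempotent.
            fullNodesByDatacenter.computeIfAbsent(datacenter, dc -> ConcurrentHashMap.newKeySet())
                                 .add(endpoint);
        }

        void markNotFull(String datacenter, String endpoint)
        {
            Set<String> fullNodes = fullNodesByDatacenter.get(datacenter);
            if (fullNodes != null)
                fullNodes.remove(endpoint); // removing an absent endpoint is a no-op
        }

        boolean isDatacenterFull(String datacenter)
        {
            Set<String> fullNodes = fullNodesByDatacenter.get(datacenter);
            return fullNodes != null && !fullNodes.isEmpty();
        }
    }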

src/java/org/apache/cassandra/utils/NoSpamLogger.java

Lines changed: 10 additions & 0 deletions
@@ -235,6 +235,16 @@ public static NoSpamLogger wrap(Logger wrapped, long minInterval, TimeUnit timeU
         return new NoSpamLogger(wrapped, minInterval, timeUnit);
     }
 
+    public boolean debug(long nowNanos, String s, Object... objects)
+    {
+        return NoSpamLogger.this.log( Level.DEBUG, s, nowNanos, objects);
+    }
+
+    public boolean debug(String s, Object... objects)
+    {
+        return NoSpamLogger.this.debug(CLOCK.nanoTime(), s, objects);
+    }
+
     public boolean info(long nowNanos, String s, Object... objects)
     {
         return NoSpamLogger.this.log( Level.INFO, s, nowNanos, objects);
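
The new debug() overloads mirror the existing info()/warn() helpers, so the datacenter bookkeeping in DiskUsageBroadcaster can log at DEBUG without flooding: repeated calls with the same statement inside the interval are suppressed. A usage sketch built on the wrap() factory visible in the hunk above; the one-minute interval is an arbitrary example value:

    import java.util.concurrent.TimeUnit;

    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    import org.apache.cassandra.utils.NoSpamLogger;

    final class NoSpamDebugSketch
    {
        private static final Logger logger = LoggerFactory.getLogger(NoSpamDebugSketch.class);
        // Wrap the regular slf4j logger so each statement is emitted at most once per minute.
        private static final NoSpamLogger noSpamLogger = NoSpamLogger.wrap(logger, 1, TimeUnit.MINUTES);

        void onStateChange(String endpoint, String datacenter)
        {
            // Uses the debug() overload added in this commit; extra calls within the interval are dropped.
            noSpamLogger.debug("Endpoint {} is FULL, added to full nodes set for datacenter {}", endpoint, datacenter);
        }
    }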
