Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions .ci/reset_quorum_majority.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/bin/bash

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment as on the minority script.

#shellcheck source=SCRIPTDIR/utils.sh
. ./.ci/utils.sh

# Network parameters
total_validators=7
# Number of validators that get stopped and wiped at the restart height.
# NOTE(review): despite the name, this evaluates to
# floor((total_validators - 1) / 3) + 1, i.e. 3 of 7 -- not more than half of
# the validators. Presumably it means "one more than the number of faulty
# validators the network tolerates"; confirm and consider renaming.
majority=$(( (total_validators - 1) / 3 + 1 ))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be over 50% to count as a majority?

network_id=0
network_name="mainnet"

# Stopping conditions
restart_height=10
final_height=20

# Time budget, in seconds, shared by all height-polling phases of this test.
max_wait=600
# Seconds between two consecutive height checks.
poll_interval=3
# Seconds the stopped validators stay offline before they are restarted.
offline_pause=30

# Define a trap handler that cleans up all processes on exit.
trap stop_nodes EXIT

# Define a trap handler that prints a message when an error occurs
trap 'echo "⛔️ Error in $BASH_SOURCE at line $LINENO: \"$BASH_COMMAND\" failed (exit $?)"' ERR

# Start all validator nodes in the background
for ((validator_index = 0; validator_index < total_validators; validator_index++)); do
  snarkos clean --dev "$validator_index" "--network=$network_id"

  snarkos start --nodisplay --network "$network_id" --dev "$validator_index" \
    --dev-num-validators "$total_validators" --validator &
  PIDS[validator_index]=$!
  echo "Started validator $validator_index with PID ${PIDS[$validator_index]}"
  # Add 1-second delay between starting nodes to avoid hitting rate limits
  sleep 1
done

wait_for_nodes "$total_validators" 0

# Phase 1: wait until every node reaches the restart height. The wait is
# bounded by max_wait so that a stalled network fails the test instead of
# hanging the CI job forever.
total_wait=0
until check_heights 0 "$total_validators" "$restart_height" "$network_name"; do
  if (( total_wait >= max_wait )); then
    echo "❌ Test failed! Nodes did not reach height $restart_height within $max_wait seconds."
    exit 1
  fi

  sleep "$poll_interval"
  total_wait=$((total_wait + poll_interval))
done
echo "All nodes reached restart height."

# Phase 2: stop a random set of $majority validators, wipe their ledgers and
# bring them back after a pause.
if ! target_list=$(generate_random_indices "$majority" "$(( ${#PIDS[@]} - 1 ))"); then
  echo "❌ Test failed! Could not select validators to restart."
  exit 1
fi

# Read one index per line; this avoids relying on unquoted word splitting.
targets=()
while IFS= read -r target_index; do
  if [[ -n "$target_index" ]]; then
    targets+=("$target_index")
  fi
done <<< "$target_list"

stop_some_nodes "${targets[@]}"

for target_index in "${targets[@]}"; do
  # Remove the original ledger
  snarkos clean "--network=$network_id" "--dev=$target_index"
done

# wait for a non-trivial amount of time
sleep "$offline_pause"

for target_index in "${targets[@]}"; do
  # Restart
  snarkos start --nodisplay "--network=$network_id" "--dev=$target_index" \
    "--dev-num-validators=$total_validators" --validator &
  PIDS[target_index]=$!
  echo "Restarted a fresh validator $target_index with PID ${PIDS[$target_index]}"
  # Add 1-second delay between starting nodes to avoid hitting rate limits
  sleep 1
done

# Account for the offline pause plus the 1-second delay per restarted node.
total_wait=$((total_wait + offline_pause + majority))

# Phase 3: wait, within the remaining budget, for all nodes to reach the final height.
while (( total_wait < max_wait )); do # 10 minutes max
  if check_heights 0 "$total_validators" "$final_height" "$network_name"; then
    echo "SUCCESS!"
    exit 0
  fi

  # Continue waiting
  sleep "$poll_interval"
  total_wait=$((total_wait + poll_interval))
  echo "Waited $total_wait seconds so far..."
done

# The main loop has expired by now
echo "❌ Test failed!"
exit 1
84 changes: 84 additions & 0 deletions .ci/reset_quorum_minority.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/bin/bash

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The script itself does what it sets out to do, and does it well.

If you like, you can apply some of the suggestions I left on chaotic-network-runner.sh:
enable strict mode, use the shared helper functions for logging and exiting (they come from utils.sh in this PR), and quote the `$` variable expansions.

#shellcheck source=SCRIPTDIR/utils.sh
. ./.ci/utils.sh

# Network parameters
total_validators=7
# Number of validators restarted per iteration:
# floor((total_validators - 1) / 3), i.e. 2 of 7.
minority=$(( (total_validators - 1) / 3 ))
network_id=0
network_name="mainnet"

# Stopping conditions
num_iterations=3
final_height=40

# Time budget, in seconds, shared by all height-polling phases of this test.
max_wait=600
# Seconds between two consecutive height checks.
poll_interval=3

# Define a trap handler that cleans up all processes on exit.
trap stop_nodes EXIT

# Define a trap handler that prints a message when an error occurs
trap 'echo "⛔️ Error in $BASH_SOURCE at line $LINENO: \"$BASH_COMMAND\" failed (exit $?)"' ERR

# Start all validator nodes in the background
for ((validator_index = 0; validator_index < total_validators; validator_index++)); do
  snarkos clean --dev "$validator_index" "--network=$network_id"

  snarkos start --nodisplay --network "$network_id" --dev "$validator_index" \
    --dev-num-validators "$total_validators" --validator &
  PIDS[validator_index]=$!
  echo "Started validator $validator_index with PID ${PIDS[$validator_index]}"
  # Add 1-second delay between starting nodes to avoid hitting rate limits
  sleep 1
done

wait_for_nodes "$total_validators" 0

total_wait=0
for ((iter = 1; iter <= num_iterations; iter++)); do
  # Restart heights are 10, 20, 30, ... for successive iterations.
  restart_height=$(( iter * 10 ))

  # Wait until every node reaches this iteration's restart height. The wait is
  # bounded by max_wait so that a stalled network fails the test instead of
  # hanging the CI job forever.
  until check_heights 0 "$total_validators" "$restart_height" "$network_name"; do
    if (( total_wait >= max_wait )); then
      echo "❌ Test failed! Nodes did not reach height $restart_height within $max_wait seconds."
      exit 1
    fi

    sleep "$poll_interval"
    total_wait=$((total_wait + poll_interval))
  done
  echo "All nodes reached restart height."

  # Stop a random set of $minority validators.
  if ! target_list=$(generate_random_indices "$minority" "$(( ${#PIDS[@]} - 1 ))"); then
    echo "❌ Test failed! Could not select validators to restart."
    exit 1
  fi

  # Read one index per line; this avoids relying on unquoted word splitting.
  targets=()
  while IFS= read -r target_index; do
    if [[ -n "$target_index" ]]; then
      targets+=("$target_index")
    fi
  done <<< "$target_list"

  stop_some_nodes "${targets[@]}"

  for target_index in "${targets[@]}"; do
    # Remove the original ledger
    snarkos clean "--network=$network_id" "--dev=$target_index"
    # Wait until the cleanup concludes
    sleep 1
    # Restart
    snarkos start --nodisplay "--network=$network_id" "--dev=$target_index" \
      "--dev-num-validators=$total_validators" --validator &
    PIDS[target_index]=$!
    echo "Restarted a fresh validator $target_index with PID ${PIDS[$target_index]}"
    # Add 1-second delay between starting nodes to avoid hitting rate limits
    sleep 1
    # Account for the two 1-second sleeps above.
    total_wait=$((total_wait + 2))
  done
done

# Wait, within the remaining budget, for all nodes to reach the final height.
while (( total_wait < max_wait )); do # 10 minutes max
  if check_heights 0 "$total_validators" "$final_height" "$network_name"; then
    echo "SUCCESS!"
    exit 0
  fi

  # Continue waiting
  sleep "$poll_interval"
  total_wait=$((total_wait + poll_interval))
  echo "Waited $total_wait seconds so far..."
done

# The main loop has expired by now
echo "❌ Test failed!"
exit 1
45 changes: 44 additions & 1 deletion .ci/utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ function get_network_name() {
esac
}

# Stops all running processe in the given list.
# Stops all running processes in the PIDS list.
function stop_nodes() {
echo "🚨 Cleaning up ${#PIDS[@]} process(es)…"
for pid in "${PIDS[@]}"; do
Expand All @@ -156,6 +156,49 @@ function stop_nodes() {
wait
}

# Generates the given number of random indices up to max_index.
# Generates the given number of unique random indices in the range 0..max_index.
#
# Arguments:
#   $1 - count: how many distinct indices to pick (non-negative integer)
#   $2 - max_index: largest index that may be picked (integer; -1 means the
#        range is empty, e.g. when it is derived from an empty array)
# Outputs:
#   The chosen indices on stdout, one per line; nothing when count is 0.
# Returns:
#   0 on success; 1 if an argument is not an integer or more indices are
#   requested than exist in the range.
function generate_random_indices() {
  local count=${1-}
  local max_index=${2-}

  # Reject anything that is not a plain integer before it reaches arithmetic
  # evaluation or shuf.
  if [[ ! "$count" =~ ^[0-9]+$ || ! "$max_index" =~ ^-?[0-9]+$ ]]; then
    echo "Error: count must be a non-negative integer and max_index an integer." >&2
    return 1
  fi

  # Check if count is greater than max_index + 1 (impossible request)
  if (( count > max_index + 1 )); then
    echo "Error: Cannot request more unique indices than exist." >&2
    return 1
  fi

  # Nothing to pick: succeed without calling shuf, which rejects the empty
  # range "0--1" that an empty list would produce.
  if (( count == 0 )); then
    return 0
  fi

  # shuf -i generates a range (0 to max), -n picks N items
  shuf -i "0-${max_index}" -n "$count"
}

# Stops select running processes from the PIDS list.
function stop_some_nodes() {
local indices=("$@")
local killed_pids=()

echo "🚨 Stopping ${#indices[@]} selected node(s)..."

for i in "${indices[@]}"; do
# Get the PID from the global PIDS array using the index
local pid="${PIDS[$i]}"

# Check if PID exists (is not empty) and is currently running
if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
echo "Killing PIDS[$i] -> $pid"
kill -9 "$pid" 2>/dev/null || true
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you really want to kill the nodes with -9 rather than SIGTERM? It might cause problems with the ledger and random failures, or it might not; I'm just asking.

Otherwise these two functions seem great to me.

# Add to list of PIDs to wait for specifically
killed_pids+=("$pid")
else
echo "Skipping PIDS[$i] (PID: $pid) - Already dead or invalid."
fi
done

# Wait only for the processes we just killed to ensure they are gone
if [ ${#killed_pids[@]} -gt 0 ]; then
wait "${killed_pids[@]}" 2>/dev/null || true
fi
}

# Succeeds if all nodes are available.
function check_nodes() {
local total_validators=$1
Expand Down
Loading