update fix zookeeper

SylvainSenechal · SylvainSenechal · commit 01513d5e9684 · 2026-02-02T11:49:06.000+01:00
diff --git a/.devcontainer/setup.sh b/.devcontainer/setup.sh
@@ -3,7 +3,19 @@
 set -e
 
 env_variables=$(yq eval '.env | to_entries | .[] | .key + "=" + .value' .github/workflows/end2end.yaml | sed 's/\${{[^}]*}}//g') && export $env_variables
-export GIT_ACCESS_TOKEN=${GITHUB_TOKEN}
+
+# In CI, GIT_ACCESS_TOKEN comes from a GitHub App token.
+# Locally, we need the user to provide one (usually via GITHUB_TOKEN).
+if [[ -z "${GIT_ACCESS_TOKEN:-}" ]]; then
+    export GIT_ACCESS_TOKEN="${GITHUB_TOKEN:-}"
+fi
+
+if [[ -z "${GIT_ACCESS_TOKEN:-}" ]]; then
+    echo "ERROR: Missing GitHub token. Set GITHUB_TOKEN (or GIT_ACCESS_TOKEN) with access to scality/zenko-operator." >&2
+    echo "Example: export GITHUB_TOKEN=ghp_***" >&2
+    exit 1
+fi
+
 export E2E_IMAGE_TAG=latest
 
 # Disable GCP tests as we don't have credentials setup in devcontainer
@@ -21,11 +33,27 @@ for i in $(seq 0 $array_length); do
     #step=$(yq ".runs.steps[$i]" .github/actions/deploy/action.yaml)
     working_dir=$(yq ".runs.steps[$i].working-directory" .github/actions/deploy/action.yaml)
     run_command=$(yq ".runs.steps[$i].run" .github/actions/deploy/action.yaml)
+    step_if=$(yq ".runs.steps[$i].if" .github/actions/deploy/action.yaml)
 
     # We don't want to run `run-e2e-test.sh` because it is used for linting here, user will run it manually if needed after deployment
     # We can't run `configure-e2e.sh` here because it needs an image that is not yet built and sent to kind, will be run after
     (
-        if [[ "$run_command" != "null" && "$run_command" != *"configure-e2e.sh"* && "$run_command" != *"run-e2e-test.sh"* ]]; then
+        should_run=true
+
+        # Best-effort support for composite action `if:` (CI evaluates these, local runner must emulate).
+        if [[ "$step_if" != "null" ]]; then
+            # Only conditional step in the deploy action today.
+            if [[ "$step_if" == *"inputs.deploy_metadata"* ]]; then
+                if [[ "${GITHUB_INPUTS_deploy_metadata:-false}" != "true" ]]; then
+                    should_run=false
+                fi
+            else
+                echo "Skipping step with unsupported condition: $step_if"
+                should_run=false
+            fi
+        fi
+
+        if [[ "$should_run" == "true" && "$run_command" != "null" && "$run_command" != *"configure-e2e.sh"* && "$run_command" != *"run-e2e-test.sh"* ]]; then
             # Inject env 'generated' from previous steps
             source "$GITHUB_ENV"
 
diff --git a/.github/scripts/end2end/fix-zookeeper.sh b/.github/scripts/end2end/fix-zookeeper.sh
@@ -11,6 +11,13 @@ NAMESPACE="${2:?Missing NAMESPACE argument}"
 ZK_STS_NAME="${ZENKO_NAME}-base-quorum"
 ZK_CONTAINER_NAME="zookeeper"
 ZK_POD_NAME="${ZK_STS_NAME}-0"
+# Name of the Pravega zookeeper-operator deployment.
+# Can be overridden, otherwise we auto-detect.
+ZK_OPERATOR_DEPLOYMENT="${ZK_OPERATOR_DEPLOYMENT:-}"
+
+# By default keep the operator scaled down.
+# This avoids it reconciling/reverting the JVMFLAGS workaround.
+ZK_OPERATOR_KEEP_SCALED_DOWN="${ZK_OPERATOR_KEEP_SCALED_DOWN:-true}"
 
 OPERATOR_WAIT_TIMEOUT=120
 STATEFULSET_WAIT_TIMEOUT=180
@@ -21,6 +28,45 @@ get_elapsed() {
     echo $(($(date +%s) - start_time))
 }
 
+normalize_k8s_name() {
+  # Print only the name part of a Kubernetes resource reference.
+  # $1: a plain name (foo) or a resource/name form (deployment.apps/foo,
+  #     deployment/foo, deploy/foo); may be empty or omitted.
+  # Output: the text after the last '/' (foo); an empty line for empty input.
+  local value="${1:-}"
+  # ${value##*/} drops the longest prefix ending in '/', so no awk subprocess
+  # is needed, and printf (unlike echo) never treats the value as an option.
+  printf '%s\n' "${value##*/}"
+}
+
+detect_zk_operator_deployment() {
+  # Resolve global ZK_OPERATOR_DEPLOYMENT to a bare name: keep a preset value,
+  # else look it up in ${NAMESPACE}; exits 1 when nothing is found.
+  if [[ -z "${ZK_OPERATOR_DEPLOYMENT}" ]]; then
+    # Prefer Helm labels if present (release name is typically 'zk-operator')
+    ZK_OPERATOR_DEPLOYMENT=$(kubectl -n "${NAMESPACE}" get deploy \
+      -l app.kubernetes.io/instance=zk-operator \
+      -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
+  fi
+
+  # Fallback: first deployment whose name looks like the operator's
+  if [[ -z "${ZK_OPERATOR_DEPLOYMENT}" ]]; then
+    ZK_OPERATOR_DEPLOYMENT=$(kubectl -n "${NAMESPACE}" get deploy \
+      -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null \
+      | grep -E 'zookeeper-operator|zk-operator' \
+      | head -n 1 || true)
+  fi
+
+  if [[ -z "${ZK_OPERATOR_DEPLOYMENT}" ]]; then
+    echo "ERROR: Could not detect zookeeper-operator deployment in namespace ${NAMESPACE}." >&2
+    echo "Hint: run 'kubectl -n ${NAMESPACE} get deploy' and set ZK_OPERATOR_DEPLOYMENT explicitly." >&2
+    exit 1
+  fi
+
+  # Strip any resource prefix (deployment/foo -> foo), e.g. from an override
+  ZK_OPERATOR_DEPLOYMENT="$(normalize_k8s_name "${ZK_OPERATOR_DEPLOYMENT}")"
+}
+
 # Wait for the Zenko operator to process the CR
 OPERATOR_WAIT_START=$(date +%s)
 
@@ -61,8 +107,18 @@ while true; do
     sleep 2
 done
 
+# Detect zk-operator deployment name
+detect_zk_operator_deployment
+
 # Patch the StatefulSet with JVM flags to disable container support
 # as ubuntu runners now are incompatible with zookeeper.
+# We need to scale down the zookeeper operator first, otherwise it will
+# reconcile and revert our patch.
+echo "Scaling down Zookeeper operator to prevent reconciliation..."
+kubectl -n "${NAMESPACE}" scale "deployment/${ZK_OPERATOR_DEPLOYMENT}" --replicas=0
+kubectl -n "${NAMESPACE}" rollout status "deployment/${ZK_OPERATOR_DEPLOYMENT}" --timeout=60s || true
+
+echo "Patching StatefulSet with JVMFLAGS..."
 kubectl -n "${NAMESPACE}" patch statefulset "${ZK_STS_NAME}" --type='strategic' \
   -p '{
     "spec": {
@@ -84,14 +140,33 @@ kubectl -n "${NAMESPACE}" patch statefulset "${ZK_STS_NAME}" --type='strategic'
     }
   }'
 
+# Verify patch stuck (and was not immediately reverted)
+echo "Verifying JVMFLAGS is present on StatefulSet template..."
+if ! kubectl -n "${NAMESPACE}" get statefulset "${ZK_STS_NAME}" \
+    -o jsonpath='{.spec.template.spec.containers[?(@.name=="'"${ZK_CONTAINER_NAME}"'")].env[?(@.name=="JVMFLAGS")].value}' \
+    | grep -q -- "-XX:-UseContainerSupport"; then
+    echo "ERROR: JVMFLAGS patch did not apply (or was reverted)." >&2
+    kubectl -n "${NAMESPACE}" get statefulset "${ZK_STS_NAME}" -o yaml | sed -n '1,200p' >&2 || true
+    exit 1
+fi
+
 
 # Delete the pod to apply the patch
 kubectl delete pod "${ZK_POD_NAME}" -n "${NAMESPACE}" --ignore-not-found=true --wait=false
 
 # Wait for the pod to become Ready
 if ! kubectl wait --for=condition=Ready "pod/${ZK_POD_NAME}" --timeout=300s -n "${NAMESPACE}"; then
     echo "ERROR: Zookeeper pod ${ZK_POD_NAME} failed to become Ready after patching."
+    # Restore the operator (scaled to 0 before patching) before giving up
+    kubectl -n "${NAMESPACE}" scale "deployment/${ZK_OPERATOR_DEPLOYMENT}" --replicas=1
     exit 1
 fi
 
+if [[ "${ZK_OPERATOR_KEEP_SCALED_DOWN}" != "true" ]]; then
+  echo "Scaling Zookeeper operator back up..."
+  kubectl -n "${NAMESPACE}" scale "deployment/${ZK_OPERATOR_DEPLOYMENT}" --replicas=1
+else
+  echo "Leaving Zookeeper operator scaled down (ZK_OPERATOR_KEEP_SCALED_DOWN=true)."
+fi
+
 echo "Zookeeper fix applied successfully."
diff --git a/.github/scripts/end2end/install-kind-dependencies.sh b/.github/scripts/end2end/install-kind-dependencies.sh
@@ -51,11 +51,11 @@ helm repo add --force-update banzaicloud-stable https://kubernetes-charts.banzai
 		echo -n "::notice file=$(basename $0),line=$LINENO,title=Banzaicloud Charts not available::"
 		echo "Failed to add banzaicloud-stable repo, using local checkout"
 
-		kafa_operator="$(mktemp -d)"
+		kafka_operator="$(mktemp -d)"
 		git -c advice.detachedHead=false clone -q --depth 1 -b "v${KAFKA_OPERATOR_VERSION}" \
-            https://github.com/banzaicloud/koperator "${kafa_operator}"
+            https://github.com/banzaicloud/koperator "${kafka_operator}"
 
-		KAFKA_CHART="${kafa_operator}/charts/kafka-operator"
+		KAFKA_CHART="${kafka_operator}/charts/kafka-operator"
 	}
 helm repo update