Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ All notable changes to this project will be documented in this file.
- Gracefully shutdown all concurrent tasks by forwarding the SIGTERM signal ([#651]).
- Remove the Spark application owner reference from the executor pods.
This allows Kubernetes to garbage collect them early when the driver or the submit job fail ([#648]).
- Clean up driver pods when the spark application is finished.
Previously, driver pods created by the submit job would be left hanging even after the job has been deleted ([#649]).

### Removed

Expand All @@ -27,6 +29,7 @@ All notable changes to this project will be documented in this file.
[#642]: https://github.com/stackabletech/spark-k8s-operator/pull/642
[#647]: https://github.com/stackabletech/spark-k8s-operator/pull/647
[#648]: https://github.com/stackabletech/spark-k8s-operator/pull/648
[#649]: https://github.com/stackabletech/spark-k8s-operator/pull/649
[#651]: https://github.com/stackabletech/spark-k8s-operator/pull/651

## [25.11.0] - 2025-11-07
Expand Down
24 changes: 23 additions & 1 deletion rust/operator-binary/src/pod_driver_controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ pub enum Error {
InvalidPod {
source: error_boundary::InvalidObject,
},

#[snafu(display("cannot delete Spark driver pod {pod_name:?}"))]
DeleteDriverPod {
source: stackable_operator::client::Error,
pod_name: String,
},
}

type Result<T, E = Error> = std::result::Result<T, E>;
Expand All @@ -58,7 +64,10 @@ impl ReconcilerError for Error {
ErrorDiscriminants::from(self).into()
}
}
/// Updates the status of the SparkApplication that started the pod.

/// This function serves two purposes:
/// 1. It updates the status of the SparkApplication CR based on the status of the driver pod.
/// 2. It deletes the driver pod when the SparkApplication reaches a terminal state (Succeeded or Failed).
pub async fn reconcile(pod: Arc<DeserializeGuard<Pod>>, client: Arc<Client>) -> Result<Action> {
tracing::info!("Starting reconcile driver pod");

Expand Down Expand Up @@ -111,6 +120,19 @@ pub async fn reconcile(pod: Arc<DeserializeGuard<Pod>>, client: Arc<Client>) ->
name: app_name.clone(),
})?;

// We must manually delete the driver pod when the application reached a terminal state
// otherwise they are left hanging forever.
if phase == "Succeeded" || phase == "Failed" {
tracing::info!(
"Spark application {app_name:?} completed with phase {phase:?}, deleting driver pod {pod_name:?}"
);
client
.delete(pod)
.await
.with_context(|_| DeleteDriverPodSnafu {
pod_name: pod_name.clone(),
})?;
}
Ok(Action::await_change())
}

Expand Down
9 changes: 7 additions & 2 deletions rust/operator-binary/src/spark_k8s_controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -633,10 +633,15 @@ fn pod_template(

let mut metadata = omb.build();

// We explicitely remove the application owner reference from executor pods.
// We explicitly remove the application owner reference from driver and executor pods.
//
// The executors then only have the driver as owner and Kubernetes can garbage collect them
// early when the driver pod or the spark-submit job is deleted.
if role == SparkApplicationRole::Executor {
// Drivers must not have the SparkApplication as owner because this prevents proper cleanup
// when the application is finished.
// The submit pod doesn't use this function right now, but we keep the "if" below for future
// sanity.
if role == SparkApplicationRole::Executor || role == SparkApplicationRole::Driver {
metadata.owner_references = None;
}

Expand Down
7 changes: 7 additions & 0 deletions tests/templates/kuttl/spark-examples/11-assert.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
timeout: 900
commands:
# Test that there is no spark driver pod left hanging
- script: test -z "$(kubectl -n $NAMESPACE get pods -o name | grep -E 'driver$')"