@@ -225,6 +225,8 @@ CommandStatus runOnKubernetes(final CommandContext context,
225225 nextStatus .set ("cluster_name" , FACTORY .textNode (client .getConfig ().getName ()));
226226 nextStatus .set ("pod_name" , FACTORY .textNode (pod .getName ()));
227227 nextStatus .set ("pod_creation_timestamp" , FACTORY .numberNode (pod .getCreationTimestamp ()));
228+ nextStatus .set ("in_temporal_config_storage_expiration" , FACTORY .numberNode (inConfigStorage .getDirectDownloadExpiration ().get ()));
229+ nextStatus .set ("out_temporal_config_storage_expiration" , FACTORY .numberNode (outConfigStorage .getDirectUploadExpiration ().get ()));
228230 nextStatus .set ("io_directory" , FACTORY .textNode (ioDirectoryPath .toString ()));
229231 nextStatus .set ("executor_state" , FACTORY .objectNode ());
230232 return createCommandStatus (pod , false , nextStatus );
@@ -256,6 +258,21 @@ CommandStatus getCommandStatusFromKubernetes(final CommandContext context,
256258 log (logMessage , clog );
257259 nextExecutorState .set ("log_offset" , FACTORY .numberNode (offset + logMessage .length ())); // update log_offset
258260 }
261+ else if (isLaunchingLongerThanInConfigStorageExpiration (previousStatusJson )) {
262+ // Fail the task because launching the pod has taken longer than the inTemporalConfigStorage expiration.
263+ TaskRequest request = context .getTaskRequest ();
264+ long attemptId = request .getAttemptId ();
265+ long taskId = request .getTaskId ();
266+
267+ final String message = s ("Pod launch timeout: attempt=%d, task=%d" , attemptId , taskId );
268+ logger .warn (message );
269+
270+ logger .info (s ("Delete pod %d" , pod .getName ()));
271+ client .deletePod (pod .getName ());
272+
273+ // Throw exception to stop the task as failure
274+ throw new TaskExecutionException (message );
275+ }
259276 else { // 'waiting'
260277 // Write the pod status to the command logger to avoid confusing users. For example, the container
261278 // stays in the waiting state when downloading container images takes a long time.
@@ -276,7 +293,7 @@ CommandStatus getCommandStatusFromKubernetes(final CommandContext context,
276293 final InputStream in = outConfigStorage .getContentInputStream (outputArchiveKey );
277294 ProjectArchives .extractTarArchive (context .getLocalProjectPath (), in ); // runtime exception
278295 }
279- else if (defaultPodTTL .isPresent () && isRunningLongerThanTTL (previousStatusJson )) {
296+ else if (isRunningLongerThanOutConfigStorageExpiration ( previousStatusJson ) || ( defaultPodTTL .isPresent () && isRunningLongerThanTTL (previousStatusJson ) )) {
280297 TaskRequest request = context .getTaskRequest ();
281298 long attemptId = request .getAttemptId ();
282299 long taskId = request .getTaskId ();
@@ -307,6 +324,22 @@ protected List<String> setArgumentsAfterScriptCommandLine()
307324 return ImmutableList .of ();
308325 }
309326
327+ private boolean isLaunchingLongerThanInConfigStorageExpiration (final ObjectNode previousStatusJson )
328+ {
329+ long creationTimestamp = previousStatusJson .get ("pod_creation_timestamp" ).asLong ();
330+ long inTemporalConfigStorageExpiration = previousStatusJson .get ("in_temporal_config_storage_expiration" ).asLong ();
331+ long currentTimestamp = Instant .now ().getEpochSecond ();
332+ return currentTimestamp > creationTimestamp + inTemporalConfigStorageExpiration ;
333+ }
334+
335+ private boolean isRunningLongerThanOutConfigStorageExpiration (final ObjectNode previousStatusJson )
336+ {
337+ long creationTimestamp = previousStatusJson .get ("pod_creation_timestamp" ).asLong ();
338+ long outTemporalConfigStorageExpiration = previousStatusJson .get ("out_temporal_config_storage_expiration" ).asLong ();
339+ long currentTimestamp = Instant .now ().getEpochSecond ();
340+ return currentTimestamp > creationTimestamp + outTemporalConfigStorageExpiration ;
341+ }
342+
310343 private boolean isRunningLongerThanTTL (final ObjectNode previousStatusJson )
311344 {
312345 long creationTimestamp = previousStatusJson .get ("pod_creation_timestamp" ).asLong ();
0 commit comments