Skip to content
Open
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
2aec76c
feat: add suspend and resume
zhansan114514 Oct 2, 2025
bc24c3c
Merge branch 'PKUHPC:master' into feat/suspend
zhansan114514 Oct 4, 2025
224d2c7
fix with ai comments
zhansan114514 Oct 5, 2025
a893d51
Merge branch 'PKUHPC:master' into feat/suspend
zhansan114514 Oct 9, 2025
f87a16b
fix format
zhansan114514 Oct 9, 2025
378dc1e
rename task-related messages and methods to use 'job' terminology
zhansan114514 Oct 12, 2025
fb9eb76
update task management to use job terminology and status handling
zhansan114514 Oct 12, 2025
6aea2cf
update error handling for task suspension and resumption
zhansan114514 Oct 13, 2025
37ed254
add config and container support
zhansan114514 Oct 14, 2025
f2bb40e
format
zhansan114514 Oct 14, 2025
5e7fd5c
handle error fixing
zhansan114514 Oct 15, 2025
9fac2d9
Merge branch 'PKUHPC:master' into feat/suspend
zhansan114514 Oct 16, 2025
fca04fd
format
zhansan114514 Oct 16, 2025
724924b
Merge branch 'PKUHPC:master' into feat/suspend
zhansan114514 Oct 16, 2025
3ba38e5
Merge branch 'PKUHPC:master' into feat/suspend
zhansan114514 Oct 21, 2025
04c7ed6
Refactor job management methods in Craned and Supervisor
zhansan114514 Oct 21, 2025
9ab002e
fix
zhansan114514 Oct 21, 2025
31b7f0f
fix
zhansan114514 Oct 21, 2025
260a480
Merge branch 'PKUHPC:master' into feat/suspend
zhansan114514 Oct 27, 2025
4ecc292
update error handling in Suspend and Resume tasks
zhansan114514 Oct 27, 2025
36f86d2
Merge feat/suspend into upstream/master
zhansan114514 Nov 4, 2025
76c9f1b
merge
zhansan114514 Nov 4, 2025
3a53c87
Merge branch 'master' of https://github.com/PKUHPC/CraneSched into fe…
zhansan114514 Nov 4, 2025
cb1f358
delete flag
zhansan114514 Nov 4, 2025
f0f44df
remove old way and try new api
zhansan114514 Nov 5, 2025
c9019ac
fix
zhansan114514 Nov 5, 2025
b9cd9de
fix
zhansan114514 Nov 5, 2025
fa2180c
find in map but not at
zhansan114514 Nov 5, 2025
26ea975
fix wrong error code
zhansan114514 Nov 5, 2025
0f3ca65
skip daemon step
zhansan114514 Nov 5, 2025
6ffd32e
Merge branch 'master' into feat/suspend
zhansan114514 Nov 15, 2025
29201ec
fix error
zhansan114514 Nov 15, 2025
5f32f58
format
zhansan114514 Nov 15, 2025
4cb0364
fix eror
zhansan114514 Nov 15, 2025
b52209d
fix
zhansan114514 Nov 15, 2025
ab7ab1a
Merge branch 'PKUHPC:master' into feat/suspend
zhansan114514 Nov 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions protos/Crane.proto
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,26 @@ message TerminateStepsReply {
string reason = 2;
}

// Request message of the Craned `SuspendJobs` RPC.
message SuspendJobsRequest {
  // Suspension works at job granularity, so entries here are job ids
  // (presumably every listed job is to be suspended -- confirm with the
  // Craned handler).
  repeated uint32 job_id_list = 1;
}

// Reply message of the Craned `SuspendJobs` RPC.
message SuspendJobsReply {
  // NOTE(review): presumably true when the suspend request succeeded --
  // confirm against the server implementation.
  bool ok = 1;

  // NOTE(review): presumably a human-readable explanation filled in when
  // `ok` is false -- confirm against the server implementation.
  string reason = 2;
}

// Request message of the Craned `ResumeJobs` RPC.
message ResumeJobsRequest {
  // Resumption works at job granularity, so entries here are job ids
  // (presumably every listed job is to be resumed -- confirm with the
  // Craned handler).
  repeated uint32 job_id_list = 1;
}

// Reply message of the Craned `ResumeJobs` RPC.
message ResumeJobsReply {
  // NOTE(review): presumably true when the resume request succeeded --
  // confirm against the server implementation.
  bool ok = 1;

  // NOTE(review): presumably a human-readable explanation filled in when
  // `ok` is false -- confirm against the server implementation.
  string reason = 2;
}

// Request message of the Craned `TerminateOrphanedStep` RPC.
message TerminateOrphanedStepRequest {
  // Maps a uint32 key to a `JobStepIds` value (declared elsewhere).
  // NOTE(review): the key is presumably a job id and the value the steps
  // of that job to terminate -- confirm against the `JobStepIds` definition.
  map<uint32, JobStepIds> job_step_ids_map = 1;
}
Expand Down Expand Up @@ -231,6 +251,8 @@ message ModifyTaskRequest {
TimeLimit = 0;
Priority = 1;
Hold = 2;
Suspend = 3;
Resume = 4;
}

uint32 uid = 1;
Expand Down Expand Up @@ -1029,6 +1051,8 @@ service Craned {
If the task is a batch task, just kill it.
*/
rpc TerminateSteps(TerminateStepsRequest) returns (TerminateStepsReply);
rpc SuspendJobs(SuspendJobsRequest) returns (SuspendJobsReply);
rpc ResumeJobs(ResumeJobsRequest) returns (ResumeJobsReply);
rpc TerminateOrphanedStep(TerminateOrphanedStepRequest) returns (TerminateOrphanedStepReply);
rpc ChangeJobTimeLimit(ChangeJobTimeLimitRequest) returns (ChangeJobTimeLimitReply);

Expand Down
1 change: 1 addition & 0 deletions protos/PublicDefs.proto
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ enum TaskStatus {
Configuring = 7;
Configured = 8;
Completing = 9;
Suspended = 10;

Invalid = 15;
}
Expand Down
18 changes: 18 additions & 0 deletions protos/Supervisor.proto
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ message InitSupervisorRequest {
string run_cmd = 4;
string kill_cmd = 5;
string delete_cmd = 6;
string pause_cmd = 7;
string resume_cmd = 8;
}
ContainerConfig container_config = 15;

Expand Down Expand Up @@ -122,6 +124,20 @@ message TerminateTaskReply {
string reason = 2;
}

// Request message of the Supervisor `SuspendJob` RPC.
// Intentionally declares no fields.
message SuspendJobRequest {
}

// Reply message of the Supervisor `SuspendJob` RPC.
message SuspendJobReply {
  // NOTE(review): presumably true when the supervisor suspended its job --
  // confirm against the Supervisor implementation.
  bool ok = 1;

  // NOTE(review): presumably a human-readable explanation filled in when
  // `ok` is false -- confirm against the Supervisor implementation.
  string reason = 2;
}

// Request message of the Supervisor `ResumeJob` RPC.
// Intentionally declares no fields.
message ResumeJobRequest {
}

// Reply message of the Supervisor `ResumeJob` RPC.
message ResumeJobReply {
  // NOTE(review): presumably true when the supervisor resumed its job --
  // confirm against the Supervisor implementation.
  bool ok = 1;

  // NOTE(review): presumably a human-readable explanation filled in when
  // `ok` is false -- confirm against the Supervisor implementation.
  string reason = 2;
}

// Request message of the Supervisor `ShutdownSupervisor` RPC.
// Declares no fields.
message ShutdownSupervisorRequest {
}

// Reply message of the Supervisor `ShutdownSupervisor` RPC.
// Declares no fields.
message ShutdownSupervisorReply {
}
Expand All @@ -132,5 +148,7 @@ service Supervisor {
rpc CheckStatus(CheckStatusRequest) returns (CheckStatusReply);
rpc ChangeTaskTimeLimit(ChangeTaskTimeLimitRequest) returns (ChangeTaskTimeLimitReply);
rpc TerminateTask(TerminateTaskRequest) returns (TerminateTaskReply);
rpc SuspendJob(SuspendJobRequest) returns (SuspendJobReply);
rpc ResumeJob(ResumeJobRequest) returns (ResumeJobReply);
rpc ShutdownSupervisor(ShutdownSupervisorRequest) returns (ShutdownSupervisorReply);
}
Loading
Loading