Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
804629a
refactor: TaskStatus logging
L-Xiafeng Jul 30, 2025
da687ad
style: auto format with clang-format.
github-actions[bot] Oct 16, 2025
ac7a21a
fix: Duplicate ctld server
L-Xiafeng Oct 16, 2025
f42a3f5
Remove duplicate debug log for freeing resource
L-Xiafeng Oct 16, 2025
bc3e582
refactor
L-Xiafeng Oct 16, 2025
9630078
fix comments
L-Xiafeng Oct 16, 2025
1a4afb3
feat: local_scheduler
NamelessOIer Sep 13, 2025
7fdedd3
fix: Lost job steps not terminated, calloc job error.
L-Xiafeng Oct 17, 2025
8efbb94
fix: Status string, remove sleep in CranedClient.cpp
L-Xiafeng Oct 17, 2025
c3c1634
refactor step db
L-Xiafeng Oct 21, 2025
290eb58
style: auto format with clang-format.
github-actions[bot] Oct 21, 2025
a47104d
fix
L-Xiafeng Oct 21, 2025
1144993
fix: update job and step insertion logic to use upsert with filters
L-Xiafeng Oct 21, 2025
e7b3ef3
style: auto format with clang-format.
github-actions[bot] Oct 21, 2025
eb44c05
update step in mongo db
L-Xiafeng Oct 21, 2025
2567321
fix: Use unique ptr for FreeStepAllocation_
L-Xiafeng Oct 21, 2025
536c4d7
fix: db mutex
L-Xiafeng Oct 22, 2025
1647800
fix: thread pool task capture
L-Xiafeng Oct 22, 2025
5701a9d
fix: TerminateOrphanedSteps
L-Xiafeng Oct 22, 2025
b47f876
fix: Handle not existed step in StepStatusChange
L-Xiafeng Oct 23, 2025
130b6db
fix: check step is null before check step id
L-Xiafeng Oct 27, 2025
dbe7af4
fix: Fix error in systemd unit file and add doc (#646)
Nativu5 Oct 20, 2025
bbcd39c
doc: Update ubuntu doc (#648)
L-Xiafeng Oct 21, 2025
349e34b
feat: Add CRI task management support
Nativu5 Jul 24, 2025
bee32c6
refactor: Refactored
Nativu5 Oct 23, 2025
00cd57b
refactor: Refactor helpers
Nativu5 Oct 23, 2025
72b83f4
fix: Fix warnings and add logs
Nativu5 Oct 23, 2025
737df87
fix: Fix CRI container termination
Nativu5 Oct 28, 2025
e37de17
fix: Fix destruct order
Nativu5 Oct 28, 2025
ca17b61
refactor: Use rich err in CRI
Nativu5 Oct 28, 2025
a823404
feat: preempt (Unstable)
NamelessOIer Oct 14, 2025
f73e1c9
allow disable & bugfix
NamelessOIer Oct 28, 2025
d41551b
feat: Add CPU binding functionality for cgroup v1 and v2
L-Xiafeng Oct 23, 2025
19d7045
bugfix
NamelessOIer Oct 29, 2025
1150f6c
refactor.
NamelessOIer Oct 30, 2025
2a22564
add cpu bind config
L-Xiafeng Oct 31, 2025
d060145
fix: priority bound
NamelessOIer Oct 31, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,13 @@ else ()
message(FATAL_ERROR "LibAIO was not found.")
endif ()

# libsubid is a hard requirement. FindSubid.cmake publishes its results in the
# SUBID_INCLUDE_DIR / SUBID_LIBRARY cache variables, so those are the names
# reported here.
find_package(Subid REQUIRED)
if (Subid_FOUND)
    message(STATUS "Subid found. Include: ${SUBID_INCLUDE_DIR}; Libs: ${SUBID_LIBRARY}")
else ()
    # Kept for symmetry with the other dependency checks in this file;
    # REQUIRED above already stops configuration when the package is missing.
    message(FATAL_ERROR "Subid library was not found.")
endif ()

if (CRANE_ENABLE_BPF)
find_package(BPF 1.4.6 REQUIRED)
endif ()
Expand Down
53 changes: 53 additions & 0 deletions CMakeModule/FindSubid.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# FindSubid.cmake
#
# Locates libsubid: the header subid.h (searched directly and under a
# "shadow" sub-directory) and the "subid" library. pkg-config, when available,
# only supplies search hints and the version string.
#
# Result variables:
#   Subid_FOUND         - TRUE when both the header and the library were found
#   SUBID_VERSION       - version reported by pkg-config (unset if pkg-config
#                         has no libsubid entry)
#   SUBID_INCLUDE_DIRS  - include directories (copy of SUBID_INCLUDE_DIR)
#   SUBID_LIBRARIES     - libraries to link (copy of SUBID_LIBRARY)
#
# Cache variables (may be preset by the user to override the search):
#   SUBID_INCLUDE_DIR
#   SUBID_LIBRARY
#
# Imported target:
#   Subid::subid

# Both paths already cached by an earlier run: suppress the repeated
# "Found Subid" report from find_package_handle_standard_args.
if (SUBID_INCLUDE_DIR AND SUBID_LIBRARY)
    set(Subid_FIND_QUIETLY TRUE)
endif()

# pkg-config is optional; it is used for hints and for the version only.
find_package(PkgConfig QUIET)
if (PKG_CONFIG_FOUND)
    pkg_check_modules(PC_SUBID QUIET libsubid)
    if (PC_SUBID_FOUND)
        set(SUBID_VERSION "${PC_SUBID_VERSION}")
    endif()
endif()

find_path(SUBID_INCLUDE_DIR
    NAMES subid.h
    HINTS ${PC_SUBID_INCLUDEDIR} ${PC_SUBID_INCLUDE_DIRS}
    PATH_SUFFIXES shadow
)

find_library(SUBID_LIBRARY
    NAMES subid
    HINTS ${PC_SUBID_LIBDIR} ${PC_SUBID_LIBRARY_DIRS}
)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Subid
    FOUND_VAR Subid_FOUND
    REQUIRED_VARS SUBID_LIBRARY SUBID_INCLUDE_DIR
    VERSION_VAR SUBID_VERSION
)

if (Subid_FOUND)
    # Conventional plural result variables for callers that do not use the
    # imported target.
    set(SUBID_INCLUDE_DIRS "${SUBID_INCLUDE_DIR}")
    set(SUBID_LIBRARIES "${SUBID_LIBRARY}")

    if (NOT TARGET Subid::subid)
        # UNKNOWN lets CMake link the file found by find_library() as-is,
        # instead of guessing STATIC vs. SHARED from the file extension.
        add_library(Subid::subid UNKNOWN IMPORTED GLOBAL)
        set_target_properties(Subid::subid PROPERTIES
            IMPORTED_LOCATION "${SUBID_LIBRARY}"
            INTERFACE_INCLUDE_DIRECTORIES "${SUBID_INCLUDE_DIR}"
        )
    endif()
endif()

# Only real cache entries are marked; SUBID_VERSION is a normal variable.
mark_as_advanced(SUBID_INCLUDE_DIR SUBID_LIBRARY)
5 changes: 0 additions & 5 deletions CodingConvention.md

This file was deleted.

19 changes: 14 additions & 5 deletions docs/deployment/backend/Ubuntu.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,17 @@ The toolchain must meet the following version requirements:
### 2.1 Install Build Tools

```bash
wget https://apt.llvm.org/llvm.sh
bash ./llvm.sh 19
apt install build-essential
apt install libmpfr-dev libgmp3-dev libmpc-dev -y
wget https://ftp.gnu.org/gnu/gcc/gcc-14.1.0/gcc-14.1.0.tar.gz
tar -xf gcc-14.1.0.tar.gz
cd gcc-14.1.0

./contrib/download_prerequisites
mkdir build && cd build
../configure --enable-checking=release --enable-languages=c,c++ --disable-multilib
make -j"$(nproc)"
make install

#For ubuntu 20.04
wget https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.sh
Expand Down Expand Up @@ -159,25 +168,25 @@ cd CraneSched
mkdir -p build && cd build

# For CGroup v1
cmake -G Ninja ..
cmake -G Ninja .. -DCMAKE_C_COMPILER=/usr/bin/gcc -DCMAKE_CXX_COMPILER=/usr/bin/g++
cmake --build .

# For CGroup v2
cmake -G Ninja .. -DCRANE_ENABLE_CGROUP_V2=true
cmake -G Ninja .. -DCRANE_ENABLE_CGROUP_V2=true -DCMAKE_C_COMPILER=/usr/bin/gcc -DCMAKE_CXX_COMPILER=/usr/bin/g++
cmake --build .
```

2. Install the built binaries:

!!! tip
We recommend deploying CraneSched using DEB packages. See the [Packaging Guide](packaging.md) for installation instructions.

```bash
cmake --install .
```

For deploying CraneSched to multiple nodes, please follow the [Multi-node Deployment Guide](../configuration/multi-node.md).


### 5.2 Configure PAM Module

PAM module configuration is optional but recommended for production clusters to control user access.
Expand Down
34 changes: 30 additions & 4 deletions docs/deployment/backend/packaging.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,24 +144,50 @@ sudo dpkg -i CraneSched-*-craned.deb
Contains files for the control node:

```
/usr/local/bin/cranectld # Control daemon binary
/etc/systemd/system/cranectld.service # Systemd service file
/usr/bin/cranectld # Control daemon binary
/usr/lib/systemd/system/cranectld.service # Systemd service file
/etc/crane/config.yaml.sample # Cluster configuration template
/etc/crane/database.yaml.sample # Database configuration template
```

!!! warning "Installation Path Differences"
The file paths differ depending on the installation method:

**When using RPM/DEB packages (`cpack`):**
- Binaries are installed to `/usr/bin/` (following the Filesystem Hierarchy Standard, FHS)
- Example: `/usr/bin/cranectld`

**When using direct installation (`cmake --install`):**
- Binaries are installed to `/usr/local/bin/` (default `CMAKE_INSTALL_PREFIX`)
- Example: `/usr/local/bin/cranectld`
- You can customize this with `cmake --install . --prefix /custom/path` (run from the build directory)

### craned Package

Contains files for compute nodes:

```
/usr/local/bin/craned # Execution daemon binary
/usr/bin/craned # Execution daemon binary
/usr/libexec/csupervisor # Per-step execution supervisor
/etc/systemd/system/craned.service # Systemd service file
/usr/lib/systemd/system/craned.service # Systemd service file
/etc/crane/config.yaml.sample # Cluster configuration template
/usr/lib64/security/pam_crane.so # PAM authentication module
```

!!! warning "Installation Path Differences"
The file paths differ depending on the installation method:

**When using RPM/DEB packages (`cpack`):**
- Binaries are installed to `/usr/bin/` (following the Filesystem Hierarchy Standard, FHS)
- Supervisor is installed to `/usr/libexec/`
- Examples: `/usr/bin/craned`, `/usr/libexec/csupervisor`

**When using direct installation (`cmake --install`):**
- Binaries are installed to `/usr/local/bin/` (default `CMAKE_INSTALL_PREFIX`)
- Supervisor is installed to `/usr/local/libexec/`
- Examples: `/usr/local/bin/craned`, `/usr/local/libexec/csupervisor`
- You can customize this with `cmake --install . --prefix /custom/path` (run from the build directory)

### Post-Installation Actions

Both packages include a post-installation script that automatically:
Expand Down
22 changes: 13 additions & 9 deletions etc/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ CraneCtldLogFile: cranectld/cranectld.log
CraneCtldMutexFilePath: cranectld/cranectld.lock
# whether cranectld runs in the foreground (true) or in the background (false)
CraneCtldForeground: true
# whether to bind jobs to specific CPU cores
BindCpu: true
CraneCtld:
# ping timeout in seconds
CranedTimeout: 30
Expand Down Expand Up @@ -153,14 +155,10 @@ Container:
# Toggle the container support in CraneSched
Enabled: false
# Relative to CraneBaseDir
TempDir: craned/containers/
# OCI Runtime must support `state`, `kill`, `delete`, `run` subcommands
# %b, %u, %U, %j, %x will be replaced by bundle_path, user, uid, job_id, job_name
RuntimeBin: /usr/bin/runc
RuntimeState: /usr/bin/runc --root=/run/user/%U/ state %u.%U.%j.%x
RuntimeKill: /usr/bin/runc --rootless=true --root=/run/user/%U/ kill -a %u.%U.%j.%x SIGTERM
RuntimeDelete: /usr/bin/runc --rootless=true --root=/run/user/%U/ delete --force %u.%U.%j.%x
RuntimeRun: /usr/bin/runc --rootless=true --root=/run/user/%U/ run -b %b %u.%U.%j.%x
TempDir: supervisor/containers/
# In most cases, ImageEndpoint is the same as RuntimeEndpoint
RuntimeEndpoint: /run/containerd/containerd.sock
ImageEndpoint: /run/containerd/containerd.sock

Plugin:
# Toggle the plugin module in CraneSched
Expand All @@ -171,9 +169,15 @@ Plugin:
PlugindDebugLevel: "trace"
# the address of Plugind to listen
PlugindListenAddress: 127.0.0.1
PlugindListenPort: 10018
PlugindListenPort: 10018
# Plugins to be loaded in Plugind
Plugins:
- Name: "dummy"
Path: "/path/to/dummy.so"
Config: "/path/to/dummy.yaml"

Preempt:
  # PreemptType can be none, qos, partition
  PreemptType: none
  # PreemptMode can be OFF, CANCEL, REQUEUE, SUSPEND
  # Quoted on purpose: YAML 1.1 resolvers read a bare OFF as boolean false.
  PreemptMode: "OFF"
2 changes: 1 addition & 1 deletion etc/cranectld.service.in
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ After=network.target nss-lookup.target
[Service]
User=crane
Group=crane
ExecStart=@CMAKE_INSTALL_PREFIX@/bin/cranectld
ExecStart=@CMAKE_INSTALL_FULL_BINDIR@/cranectld

[Install]
WantedBy=multi-user.target
2 changes: 1 addition & 1 deletion etc/craned.service.in
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ After=network.target nss-lookup.target
[Service]
User=root
Group=root
ExecStart=@CMAKE_INSTALL_PREFIX@/bin/craned
ExecStart=@CMAKE_INSTALL_FULL_BINDIR@/craned

[Install]
WantedBy=multi-user.target
10 changes: 7 additions & 3 deletions protos/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
# Generate proto files
set(proto_src PublicDefs.proto Crane.proto Supervisor.proto Plugin.proto)
set(GENERATED_FILE_PATH ${CMAKE_SOURCE_DIR}/generated)
file(MAKE_DIRECTORY ${GENERATED_FILE_PATH})

include(${CMAKE_SOURCE_DIR}/CMakeModule/ProtobufGenerate.cmake)

set(proto_src PublicDefs.proto Crane.proto Supervisor.proto Plugin.proto)

# `${protobuf_SOURCE_DIR}/src` is expanded at the call site below, so the
# variable has to be populated from the global property BEFORE the call.
# The property is only read when it is actually set, so a value that is
# already visible in this scope is never overwritten with an empty string.
get_property(crane_protobuf_dir_is_set GLOBAL PROPERTY protobuf_SOURCE_DIR SET)
if (crane_protobuf_dir_is_set)
    get_property(protobuf_SOURCE_DIR GLOBAL PROPERTY protobuf_SOURCE_DIR)
endif ()
unset(crane_protobuf_dir_is_set)

PROTOBUF_GENERATE_GRPC_CPP(ProtoCxxSources ProtoCxxHeaders ${GENERATED_FILE_PATH}/protos
        ${protobuf_SOURCE_DIR}/src ${proto_src})

message(STATUS "Generated proto: ${ProtoCxxSources} ${ProtoCxxHeaders}")
message(STATUS "_PROTOBUF_PROTOC: ${_PROTOBUF_PROTOC};protobuf_SOURCE_DIR: ${protobuf_SOURCE_DIR}")

add_library(crane_proto_lib STATIC
${ProtoCxxSources} ${ProtoCxxHeaders})
Expand All @@ -21,4 +22,7 @@ set_property(DIRECTORY PROPERTY ADDITIONAL_MAKE_CLEAN_FILES
"${CMAKE_CURRENT_SOURCE_DIR}/generated/*")

target_link_libraries(crane_proto_lib absl::base absl::synchronization)
set_property(TARGET crane_proto_lib PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET crane_proto_lib PROPERTY POSITION_INDEPENDENT_CODE ON)

# Add CRI proto subdirectory
add_subdirectory(cri)
Loading
Loading