Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 24 additions & 6 deletions src/fah/client/Config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ void Config::setState(const JSON::Value &msg) {


// Reads the "on_idle" boolean from this config.
bool Config::getOnIdle() const {
  return getBoolean("on_idle");
}
// Reads the "different_idle_resources" boolean from this config.
bool Config::getDifferentIdleResources() const {
  const bool differentIdle = getBoolean("different_idle_resources");
  return differentIdle;
}
// Reads the "on_battery" boolean from this config.
bool Config::getOnBattery() const {
  return getBoolean("on_battery");
}
// Reads the "keep_awake" boolean from this config.
bool Config::getKeepAwake() const {
  return getBoolean("keep_awake");
}

Expand Down Expand Up @@ -117,27 +120,42 @@ bool Config::getBeta(const std::set<string> &gpus) const {
}


// Returns the configured CPU count, capped at the count reported by
// SystemInfo::getCPUCount().  When isIdle is set, "cpus_idle" is preferred
// and "cpus" is used if "cpus_idle" is not present.
// NOTE(review): this span is a diff hunk without +/- markers -- the first
// signature line and the `getU32("cpus")` initializer appear to be the
// removed pre-change lines; confirm against the repository.
uint32_t Config::getCPUs() const {
uint32_t Config::getCPUs(bool isIdle) const {
uint32_t maxCPUs = SystemInfo::instance().getCPUCount();
uint32_t cpus = getU32("cpus");
uint32_t cpus;

if (isIdle) {
// Try cpus_idle first, fallback to cpus if not present
cpus = has("cpus_idle") ? getU32("cpus_idle") : getU32("cpus");
} else {
cpus = getU32("cpus");
}

// Clamp the configured value to the count reported by SystemInfo
return maxCPUs < cpus ? maxCPUs : cpus;
}


// Collects the IDs of the entries in app.getGPUs() for which
// GPUResource::isSupported() holds under this config.  isIdle is passed
// through to isSupported().
// NOTE(review): this span is a diff hunk without +/- markers -- the first
// signature line and the one-argument isSupported() call appear to be the
// removed pre-change lines; confirm against the repository.
std::set<string> Config::getGPUs() const {
std::set<string> Config::getGPUs(bool isIdle) const {
std::set<string> gpus;

for (auto &v: app.getGPUs()) {
// Each entry is cast to GPUResource before it is queried
auto &gpu = *v.cast<GPUResource>();
if (gpu.isSupported(*this)) gpus.insert(gpu.getID());
if (gpu.isSupported(*this, isIdle)) gpus.insert(gpu.getID());
}

return gpus;
}


// Reports whether the GPU with the given id is marked "enabled" in the "gpus"
// map, or in the "gpus_idle" map when isIdle is set and that map exists.
// NOTE(review): this span is a diff hunk without +/- markers -- the first
// signature line and the `*get("gpus")` line right after it appear to be the
// removed pre-change lines; confirm against the repository.
bool Config::isGPUEnabled(const string &id) const {
auto &gpus = *get("gpus");
bool Config::isGPUEnabled(const string &id, bool isIdle) const {
// Fall back to the regular "gpus" config if "gpus_idle" doesn't exist
if (isIdle && !has("gpus_idle")) {
// Recursively call with isIdle=false to get the non-idle version
return isGPUEnabled(id, false);
}

// Pick the map that matches the requested mode
const string gpuKey = isIdle ? "gpus_idle" : "gpus";
auto &gpus = *get(gpuKey);
// A GPU with no entry in the selected map is reported as not enabled
if (!gpus.has(id)) return false;
// presumably the second argument is the default used when "enabled" is
// missing -- TODO confirm against the JSON API
return gpus.get(id)->getBoolean("enabled", false);
}
Expand Down
7 changes: 4 additions & 3 deletions src/fah/client/Config.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ namespace FAH {
void setState(const cb::JSON::Value &msg);

bool getOnIdle() const;
bool getDifferentIdleResources() const;
bool getOnBattery() const;
bool getKeepAwake() const;
void setPaused(bool paused);
Expand All @@ -63,9 +64,9 @@ namespace FAH {
uint64_t getProjectKey(const std::set<std::string> &gpus) const;
bool getBeta(const std::set<std::string> &gpus) const;

uint32_t getCPUs() const;
std::set<std::string> getGPUs() const;
bool isGPUEnabled(const std::string &id) const;
uint32_t getCPUs(bool isIdle = false) const;
std::set<std::string> getGPUs(bool isIdle = false) const;
bool isGPUEnabled(const std::string &id, bool isIdle = false) const;
bool isComputeDeviceEnabled(const std::string &type) const;
void disableGPU(const std::string &id);

Expand Down
4 changes: 2 additions & 2 deletions src/fah/client/GPUResource.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,8 @@ bool GPUResource::isComputeDeviceSupported(
}


bool GPUResource::isSupported(const Config &config) const {
return getBoolean("supported", false) && config.isGPUEnabled(getID()) &&
bool GPUResource::isSupported(const Config &config, bool isIdle) const {
return getBoolean("supported", false) && config.isGPUEnabled(getID(), isIdle) &&
(isComputeDeviceSupported("cuda", config) ||
isComputeDeviceSupported("hip", config) ||
isComputeDeviceSupported("opencl", config));
Expand Down
2 changes: 1 addition & 1 deletion src/fah/client/GPUResource.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ namespace FAH {

bool isComputeDeviceSupported(
const std::string &type, const Config &config) const;
bool isSupported(const Config &config) const;
bool isSupported(const Config &config, bool isIdle = false) const;
void writeRequest(cb::JSON::Sink &sink, const Config &config) const;
};
}
Expand Down
30 changes: 23 additions & 7 deletions src/fah/client/Group.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@ bool Group::waitForIdle() const {
}


// True only when the OS reports the system as idle and the config's
// different-idle-resources option is on.  The idle check runs first; the
// config is consulted only if the system is idle.
bool Group::useIdleResources() const {
  if (!app.getOS().isSystemIdle()) return false;
  return config->getDifferentIdleResources();
}


// True when the config's on-battery option is off and the OS reports that
// the machine is on battery.  The OS is queried only if the option is off.
bool Group::waitOnBattery() const {
  if (config->getOnBattery()) return false;
  return app.getOS().isOnBattery();
}
Expand All @@ -96,11 +101,16 @@ bool Group::waitOnGPU() const {

// Supported GPU IDs; getGPUs() is called without the isIdle argument here,
// so the set is built from the regular (non-idle) configuration.
auto supportedGPUs = config->getGPUs();
auto &gpus = *config->get("gpus");
// NOTE(review): Config::isGPUEnabled() checks has("gpus_idle") before reading
// that key, but this lookup is unguarded -- confirm get() is safe when the
// key is missing.
auto &gpusIdle = *config->get("gpus_idle");

// Report true if a GPU enabled in "gpus" is absent from the supported set
for (auto &id: gpus.keys())
if (config->isGPUEnabled(id) && !supportedGPUs.count(id))
return true;

// Same check for GPUs enabled in "gpus_idle".
// NOTE(review): supportedGPUs was built with isIdle=false and isSupported()
// requires isGPUEnabled(id, isIdle), so a GPU enabled only in "gpus_idle"
// looks like it can never be in that set and this would always return true
// for it -- confirm whether getGPUs(true) should be compared here instead.
for (auto &id: gpusIdle.keys())
if (config->isGPUEnabled(id, true) && !supportedGPUs.count(id))
return true;

return false;
}

Expand Down Expand Up @@ -215,12 +225,17 @@ void Group::update() {

// No further action if waiting
if (config->getPaused() || waitForIdle() || waitOnBattery() || waitOnGPU() ||
hasUnrunWUs() || Time::now() < waitUntil)
return event->add(0.25); // Check again later

// Allocate resources
unsigned remainingCPUs = config->getCPUs();
std::set<string> remainingGPUs = config->getGPUs();
hasUnrunWUs() || Time::now() < waitUntil)
return event->add(0.25); // Check again later

// Determine which resource configuration to use:
// - If different_idle_resources is enabled, use the idle resources while the
//   system is idle and the regular resources otherwise
// - If different_idle_resources is disabled, always use the regular resources,
//   whether or not on_idle is enabled
bool useIdleRes = useIdleResources();

unsigned remainingCPUs = config->getCPUs(useIdleRes);
std::set<string> remainingGPUs = config->getGPUs(useIdleRes);
std::set<string> enabledWUs;

// Allocate GPUs with minimum CPU requirements
Expand Down Expand Up @@ -290,7 +305,8 @@ void Group::update() {
}

// Add new WU if we don't already have too many and there are some resources
const unsigned maxWUs = config->getGPUs().size() + config->getCPUs() / 64 + 3;
const unsigned maxWUs = config->getGPUs(useIdleRes).size() +
config->getCPUs(useIdleRes) / 64 + 3;
if (wuCount < maxWUs && (remainingCPUs || remainingGPUs.size())) {
app.getUnits()->add(
new Unit(app, name, app.getNextWUID(), remainingCPUs, remainingGPUs));
Expand Down
1 change: 1 addition & 0 deletions src/fah/client/Group.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ namespace FAH {
void setState(const cb::JSON::Value &msg);

bool waitForIdle() const;
bool useIdleResources() const;
bool waitOnBattery() const;
bool waitOnGPU() const;
bool keepAwake() const;
Expand Down
1 change: 1 addition & 0 deletions src/fah/client/OS.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,4 +114,5 @@ void OS::update() {

// Update application info
app.get("info")->insertBoolean("on_battery", onBattery);
app.get("info")->insertBoolean("system_idle", idle);
}
25 changes: 14 additions & 11 deletions src/resources/group.json
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
{
"on_idle": false,
"on_battery": true,
"keep_awake": true,
"paused": true,
"finish": false,
"beta": false,
"cuda": true,
"hip": true,
"key": 0,
"cpus": 0,
"gpus": {}
"on_idle": false,
"different_idle_resources": false,
"on_battery": true,
"keep_awake": true,
"paused": true,
"finish": false,
"beta": false,
"cuda": true,
"hip": true,
"key": 0,
"cpus": 0,
"cpus_idle": 0,
"gpus": {},
"gpus_idle": {}
}