Skip to content

Commit a523a10

Browse files
committed
Support Wi4MPI verification, improved library mapping, and config command
- Wi4MPI: Added installation verification (ELF arch check, binary check), support for ppc64le, and better handling of missing or invalid install directories. - Launch: Added automatic binding of driver/config directories for graphics/interconnect libraries (ibverbs, OpenCL, Vulkan, etc.) to prevent driver loader errors. - CLI: Added new `config` command to list, get, and set configuration values. - Demo: Added arguments for launcher/OSU control, enabled container baseline, and forced local Wi4MPI usage.
1 parent fb5e9ba commit a523a10

File tree

4 files changed

+167
-33
lines changed

4 files changed

+167
-33
lines changed

e4s_cl/__init__.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -103,11 +103,10 @@ def _get_e4s_cl_script():
103103
os.path.join(E4S_CL_HOME, 'system'))))
104104
"""str: System-level E4S Container Launcher files."""
105105

106-
USER_PREFIX = os.path.realpath(
107-
os.path.abspath(
108-
os.environ.get(
109-
'__E4S_CL_USER_PREFIX__',
110-
os.path.join(os.path.expanduser('~'), '.local', 'e4s_cl'))))
106+
USER_PREFIX = os.path.abspath(
107+
os.environ.get(
108+
'__E4S_CL_USER_PREFIX__',
109+
os.path.join(os.path.expanduser('~'), '.local', 'e4s_cl')))
111110
"""str: User-level E4S Container Launcher files."""
112111

113112
_CONTAINER_DIR_ENV = os.environ.get("E4S_CL_CONTAINER_DIR") or os.environ.get("E4S_CL_CONTAINER_DIRECTORY")

e4s_cl/cf/wi4mpi/install.py

Lines changed: 64 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
VENDOR_BINARIES,
2727
available_compilers,
2828
)
29+
from elftools.elf.elffile import ELFFile
2930

3031
LOGGER = get_logger(__name__)
3132

@@ -157,27 +158,77 @@ def _double_tap(cmd):
157158
return not success
158159

159160

161+
def _check_wi4mpi_install(install_dir: Path) -> bool:
162+
"""
163+
Sanity check for the Wi4MPI installation
164+
"""
165+
# Check binary existence
166+
binary = install_dir / 'bin' / 'wi4mpi'
167+
if not binary.exists():
168+
LOGGER.debug("Wi4MPI binary not found at %s", binary)
169+
return False
170+
171+
# Check architecture of shared objects
172+
# Expected machine from os.uname().machine
173+
machine = os.uname().machine
174+
expected = {
175+
'x86_64': 'EM_X86_64',
176+
'amd64': 'EM_X86_64',
177+
'aarch64': 'EM_AARCH64',
178+
'ppc64le': 'EM_PPC64',
179+
'ppc64': 'EM_PPC64',
180+
}.get(machine)
181+
182+
if expected:
183+
# Check libexec shared objects
184+
lib_dir = install_dir / 'libexec' / 'wi4mpi'
185+
libraries = list(lib_dir.glob('libwi4mpi_*.so'))
186+
187+
if libraries:
188+
for lib in libraries:
189+
if not lib.is_file():
190+
continue
191+
try:
192+
with open(lib, 'rb') as f:
193+
elf = ELFFile(f)
194+
if elf.header.e_machine != expected:
195+
LOGGER.error(
196+
"Wi4MPI library %s architecture mismatch: expected %s, got %s",
197+
lib.name, expected, elf.header.e_machine)
198+
return False
199+
except Exception as err:
200+
LOGGER.debug("Failed to check architecture of %s: %s", lib,
201+
err)
202+
203+
# Check if executable runs
204+
if run_subprocess([str(binary), '-h'], discard_output=True):
205+
LOGGER.error("Wi4MPI binary at %s failed to run", binary)
206+
return False
207+
208+
return True
209+
210+
160211
def install_wi4mpi(install_dir: Path) -> Optional[Path]:
161212
"""Clones and installs wi4mpi from github releases"""
162213

163-
if os.uname().machine not in {'x86_64', 'amd64', 'aarch64'}:
214+
if os.uname().machine not in {'x86_64', 'amd64', 'aarch64', 'ppc64le'}:
164215
LOGGER.warning(
165216
"Wi4MPI not available for the following architecture: %s",
166217
os.uname().machine)
167218
return None
168219

169-
binary = install_dir / 'bin' / 'wi4mpi'
170-
if install_dir.exists() and binary.exists():
171-
LOGGER.debug(
172-
"Skipping installation for already installed Wi4MPI in %s",
173-
install_dir)
174-
return install_dir
175-
176-
if install_dir.exists() and list(install_dir.glob('*')):
177-
LOGGER.error(
178-
"Attempting Wi4MPI installation in a non-empty directory: %s",
179-
str(install_dir))
180-
return None
220+
if install_dir.exists():
221+
if _check_wi4mpi_install(install_dir):
222+
LOGGER.debug(
223+
"Skipping installation for already installed Wi4MPI in %s",
224+
install_dir)
225+
return install_dir
226+
227+
if list(install_dir.glob('*')):
228+
LOGGER.error(
229+
"Target directory %s is not empty and contains an invalid or incomplete Wi4MPI installation.",
230+
install_dir)
231+
return None
181232

182233
# Assert CMake is available
183234
cmake_executable = which("cmake")

e4s_cl/cli/commands/launch.py

Lines changed: 61 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -144,17 +144,19 @@ def _setup_wi4mpi(
144144
translation = [vendor.replace('mvapich', 'mpich') for vendor in translation]
145145

146146
# Locate the Wi4MPI installation and store it in parameters
147-
if parameters.wi4mpi is None:
147+
target_dir = parameters.wi4mpi
148+
if target_dir is None:
148149
target_dir = Path(config.CONFIGURATION.wi4mpi_install_directory)
149-
LOGGER.debug("Target: %s", target_dir)
150-
wi4mpi_install = install_wi4mpi(target_dir)
151-
if wi4mpi_install:
152-
parameters.wi4mpi = wi4mpi_install
153-
else:
154-
LOGGER.error(
155-
"Wi4MPI is required for this configuration, but installation failed"
156-
)
157-
return []
150+
151+
LOGGER.debug("Target: %s", target_dir)
152+
wi4mpi_install = install_wi4mpi(target_dir)
153+
if wi4mpi_install:
154+
parameters.wi4mpi = wi4mpi_install
155+
else:
156+
LOGGER.error(
157+
"Wi4MPI is required for this configuration, but installation failed"
158+
)
159+
return []
158160

159161
run_c_lib, run_f_lib = wi4mpi_find_libraries(family_metadata,
160162
mpi_libraries)
@@ -290,7 +292,7 @@ def _construct_parser(self):
290292
parser.add_argument(
291293
'--wi4mpi',
292294
type=arguments.posix_path,
293-
help="Path towards a Wi4MPI installation to use",
295+
help="Path to a Wi4MPI installation (will be installed there if missing)",
294296
metavar='installation',
295297
)
296298

@@ -390,6 +392,54 @@ def main(self, argv):
390392
if varname in os.environ.keys():
391393
launcher.extend(shlex.split(f"-x {varname}"))
392394

395+
# Helper: bind system configuration for libraries that use plugins/drivers
396+
# This prevents the host library from loading incompatible container drivers
397+
# by masking the container's configuration directories with the host's.
398+
399+
# Define known configuration paths for libraries that require masking
400+
system_config_dirs = ['/etc', '/usr/etc', '/usr/local/etc']
401+
402+
# Library -> Subdirectories to mask
403+
config_masks = {
404+
'libibverbs.so': ['libibverbs.d'],
405+
'libOpenCL.so': ['OpenCL'],
406+
'libvulkan.so': ['vulkan'],
407+
'libGLX.so': ['glvnd'],
408+
'libEGL.so': ['glvnd']
409+
}
410+
411+
# Check environment variables for driver paths that need directory binding
412+
# (Binding the full directory handles scanning/plugins better than individual files)
413+
driver_env_vars = {
414+
'libibverbs.so': ['IBVERBS_DRIVER_PATH']
415+
}
416+
417+
for lib in getattr(parameters, 'libraries', []):
418+
# Check for driver directories relative to lib (Standard convention)
419+
if lib.name.startswith('libibverbs.so'):
420+
verbs_dir = lib.parent / 'libibverbs'
421+
if verbs_dir.exists():
422+
parameters.files.add(verbs_dir)
423+
424+
# Check for environment-defined driver paths
425+
for prefix, vars in driver_env_vars.items():
426+
if lib.name.startswith(prefix):
427+
for var in vars:
428+
if var in os.environ:
429+
path = Path(os.environ[var])
430+
if path.exists():
431+
parameters.files.add(path)
432+
433+
# Check for configuration directories to mask
434+
for prefix, subdirs in config_masks.items():
435+
if lib.name.startswith(prefix):
436+
for conf_root in system_config_dirs:
437+
for subdir in subdirs:
438+
conf_path = Path(conf_root) / subdir
439+
if conf_path.exists():
440+
parameters.files.add(conf_path)
441+
442+
393443
execute_command = _format_execute(parameters)
394444

395445
def _supports_end_of_options(tokens: List[str]) -> bool:

scripts/demo.sh

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ E4S_CL_IMAGE_DEF=""
4141
E4S_CL_REBUILD_IMAGE="0"
4242
E4S_CL_APPTAINER_BUILD_ARGS=""
4343
E4S_CL_RUN_HOST_BASELINE="1"
44-
E4S_CL_RUN_CONTAINER_BASELINE="0"
44+
E4S_CL_RUN_CONTAINER_BASELINE="1"
4545
E4S_CL_PRUNE_INTEL_OPENCL="0"
4646
E4S_CL_SKIP_PROFILE_DETECT="0"
4747
E4S_CL_WI4MPI_CFLAGS="-Wno-error=implicit-function-declaration -Wno-error=incompatible-pointer-types -Wno-error=format -Wno-error=int-conversion -Wno-error=return-type -include string.h -include sys/time.h"
@@ -52,6 +52,8 @@ E4S_CL_HOST_MPIRUN=""
5252
E4S_CL_LAUNCHER=""
5353
E4S_CL_SCHEDULER=""
5454
E4S_CL_E4SCL_LAUNCH_ARGS=""
55+
E4S_CL_LAUNCHER_ARGS=""
56+
E4S_CL_OSU_ARGS=""
5557
E4S_CL_TIMEOUT_DURATION="60s"
5658

5759
log() { printf "[e4s-cl-test] %s\n" "$*"; }
@@ -129,7 +131,7 @@ Options:
129131
--cache-dir <path> Cache directory (default: _e4scl_cache_<tag>)
130132
--clean-workdir Delete workdir on exit (default: off)
131133
--host-baseline <on|off> Run host-only baseline (default: on)
132-
--container-baseline <on|off> Run container-only baseline (default: off)
134+
--container-baseline <on|off> Run container-only baseline (default: on)
133135
--no-prune-intel-opencl Disable workaround that prunes Intel/OpenCL libs from profile (default: pruning enabled)
134136
--skip-profile-detect Skip profile detect if profile already has bindings (default: off)
135137
--wi4mpi-cflags "..." Extra C/C++ flags for Wi4MPI build (default: relax GCC 14 errors)
@@ -139,7 +141,9 @@ Options:
139141
--host-mpirun <path> Override host mpirun/mpiexec (default: auto-detect)
140142
--launcher <cmd> Force launcher (mpirun or srun) (default: auto-detect)
141143
--scheduler <name> If "slurm", use srun when available (default: none)
142-
--e4scl-launch-args "..." Extra args for e4s-cl launch (default: none)
144+
--launcher-args "..." Extra arguments for the MPI launcher (e.g. -p partition -N 2)
145+
--osu-args "..." Override OSU benchmark arguments (e.g. "-i 1000 -m 1024:1048576")
146+
Safe for latency, bw, and allreduce.
143147
--timeout <duration> Timeout for MPI runs (default: 60s)
144148
--check Check environment prerequisites and exit
145149
-h, --help Show help
@@ -223,6 +227,8 @@ while [[ $# -gt 0 ]]; do
223227
--launcher) E4S_CL_LAUNCHER="$2"; shift 2 ;;
224228
--scheduler) E4S_CL_SCHEDULER="$2"; shift 2 ;;
225229
--e4scl-launch-args) E4S_CL_E4SCL_LAUNCH_ARGS="$2"; shift 2 ;;
230+
--launcher-args) E4S_CL_LAUNCHER_ARGS="$2"; shift 2 ;;
231+
--osu-args) E4S_CL_OSU_ARGS="$2"; shift 2 ;;
226232
--timeout) E4S_CL_TIMEOUT_DURATION="$2"; shift 2 ;;
227233
--check) E4S_CL_ONLY_CHECK="1"; shift ;;
228234
-h|--help) usage; exit 0 ;;
@@ -334,6 +340,7 @@ run_timed() {
334340
local out_file="${E4S_CL_WORKDIR}/timing.dat"
335341

336342
log "Running [${label}]..."
343+
log "Command: ${cmd[*]}"
337344
# Use python for distinct wall-clock measurement
338345
local start
339346
start=$(python3 -c 'import time; print(time.time())')
@@ -472,6 +479,11 @@ else
472479
LAUNCHER_ARGS=("-np" "${E4S_CL_MPI_PROCS}")
473480
fi
474481

482+
if [[ -n "${E4S_CL_LAUNCHER_ARGS}" ]]; then
483+
read -r -a EXTRA_LAUNCHER_ARGS <<< "${E4S_CL_LAUNCHER_ARGS}"
484+
LAUNCHER_ARGS+=("${EXTRA_LAUNCHER_ARGS[@]}")
485+
fi
486+
475487
HOST_MPI_VERSION="$(${HOST_MPIRUN} --version 2>/dev/null | head -n 2 || true)"
476488
HOST_MPI_FAMILY="$(detect_mpi_family "${HOST_MPI_VERSION}")"
477489
CONTAINER_MPI_VERSION="$(${CONTAINER_CMD} exec "${E4S_CL_IMAGE}" mpirun --version 2>/dev/null | head -n 2 || true)"
@@ -548,6 +560,12 @@ E4S_CL_BIN="${REPO_ROOT}/.venv/bin/e4s-cl"
548560

549561
if [[ "${NEEDS_TRANSLATION}" == "1" ]]; then
550562
log "Wi4MPI translation required; e4s-cl will install Wi4MPI during launch if missing"
563+
564+
# Force wi4mpi to use a local directory inside the workdir
565+
WI4MPI_LOCAL_PREFIX="${E4S_CL_WORKDIR}/wi4mpi"
566+
E4SCL_LAUNCH_ARGS+=("--wi4mpi" "${WI4MPI_LOCAL_PREFIX}")
567+
log "CONFIG: using local Wi4MPI prefix: ${WI4MPI_LOCAL_PREFIX}"
568+
551569
if [[ -n "${E4S_CL_WI4MPI_CFLAGS}" ]]; then
552570
log "CONFIG: Wi4MPI build flags set (for e4s-cl internal use): ${E4S_CL_WI4MPI_CFLAGS}"
553571
log "CONFIG: These flags will be used by e4s-cl when building Wi4MPI, not for OSU benchmarks"
@@ -625,12 +643,21 @@ printf "osu_url=%s\nosu_sha256=%s\n" "${E4S_CL_OSU_URL}" "${E4S_CL_OSU_SHA256}"
625643

626644
if [[ "${E4S_CL_MODE}" == "light" ]]; then
627645
OSU_BENCHES=("pt2pt/osu_latency" "pt2pt/osu_bw")
628-
OSU_ARGS=("-x" "10" "-i" "100" "-m" "8:1024")
646+
OSU_ARGS=("-x" "100" "-i" "1000" "-m" "8:65536")
629647
else
630648
OSU_BENCHES=("pt2pt/osu_latency" "pt2pt/osu_bw" "collective/osu_allreduce")
631649
OSU_ARGS=()
632650
fi
633651

652+
if [[ -n "${E4S_CL_OSU_ARGS}" ]]; then
653+
read -r -a OSU_ARGS <<< "${E4S_CL_OSU_ARGS}"
654+
else
655+
# Default Arguments:
656+
# - Latency: 1000 iterations to dampen startup noise
657+
# - Bandwidth: up to 1MB (1048576) is usually sufficient to see peak BW without timing out
658+
OSU_ARGS=("-x" "100" "-i" "1000" "-m" "8:1048576")
659+
fi
660+
634661
if [[ "${E4S_CL_MPI_PROCS}" != "2" ]]; then
635662
if printf '%s\n' "${OSU_BENCHES[@]}" | grep -q "pt2pt/osu_"; then
636663
log "NOTE: pt2pt OSU benchmarks require exactly 2 processes (current: ${E4S_CL_MPI_PROCS})"
@@ -730,6 +757,9 @@ if ! profile_has_bindings; then
730757
fi
731758
fi
732759

760+
log "Profile Content (Libraries/Files to be bound):"
761+
"${E4S_CL_BIN}" profile show
762+
733763
log "Step 5: Building Container Benchmarks. We compile the same benchmarks *inside* the container against the Container's MPI to demonstrate ABI compatibility or translation."
734764
OSU_CONT_PREFIX="${E4S_CL_WORKDIR}/osu-container"
735765
OSU_CONT_META="${OSU_CONT_PREFIX}/.build-meta"
@@ -750,6 +780,7 @@ if [[ "${REBUILD_CONT_OSU}" == "1" ]]; then
750780
set -euo pipefail; \
751781
unset CFLAGS; \
752782
unset CXXFLAGS; \
783+
unset CXX; \
753784
cd /work; \
754785
rm -rf osu-container-build && mkdir osu-container-build; \
755786
tar -xzf /cache/osu.tar.gz -C osu-container-build --strip-components=1; \
@@ -764,6 +795,9 @@ else
764795
fi
765796

766797
log "Step 6: MAIN TEST. Running the container-compiled binary using the Host's MPI. e4s-cl handles the library injection and launcher wrapping."
798+
log "Debug: Verifying library resolution inside container (ldd)"
799+
"${E4S_CL_BIN}" launch "${E4SCL_LAUNCH_ARGS[@]}" "${LAUNCHER_BIN}" -n 1 ldd "${OSU_CONT_PREFIX}/libexec/osu-micro-benchmarks/mpi/pt2pt/osu_latency" | grep -E "libmpi|libmpich|libpami|libfabric" || echo " (ldd check finished, no obvious MPI libs found in grep filter)"
800+
767801
for bench in "${OSU_BENCHES[@]}"; do
768802
bench_name="$(basename "${bench}")"
769803
out_file="${E4S_CL_WORKDIR}/e4scl_${bench_name}.txt"

0 commit comments

Comments
 (0)