diff --git a/CMakeLists.txt b/CMakeLists.txt index 46f55800d01..a270cadcd82 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. -# Copyright 2024-2025 Arm Limited and/or its affiliates. +# Copyright 2024-2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -614,8 +614,14 @@ install(FILES tools/cmake/executorch-config.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ExecuTorch ) -if(EXECUTORCH_BUILD_ARM_BAREMETAL) +if(EXECUTORCH_BUILD_ARM_BAREMETAL + OR EXECUTORCH_BUILD_ARM_ETHOSU_LINUX + OR EXECUTORCH_BUILD_VGF +) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm) +endif() + +if(EXECUTORCH_BUILD_ARM_BAREMETAL OR EXECUTORCH_BUILD_ARM_ETHOSU_LINUX) list(APPEND _executorch_backends executorch_delegate_ethos_u) endif() @@ -1063,7 +1069,6 @@ if(EXECUTORCH_BUILD_VULKAN) endif() if(EXECUTORCH_BUILD_VGF) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm) list(APPEND _executorch_backends vgf_backend) endif() @@ -1197,6 +1202,21 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER) endif() target_link_libraries(executor_runner ${_executor_runner_libs}) target_compile_options(executor_runner PUBLIC ${_common_compile_options}) + if(EXECUTORCH_BUILD_ARM_ETHOSU_LINUX) + target_sources( + executor_runner + PRIVATE + ${CMAKE_SOURCE_DIR}/examples/arm/executor_runner/ethosu_link_helper.cpp + ) + target_compile_definitions( + executor_runner PRIVATE EXECUTORCH_BUILD_ARM_ETHOSU_LINUX=1 + ) + # Wrap static linking like the delegate_runner to keep images + # self-contained. + target_link_options( + executor_runner PRIVATE -static-libstdc++ -static-libgcc + ) + endif() # Automatically set when using `emcmake cmake` for Wasm build. 
if(EMSCRIPTEN) diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt index 0ffa6f172bf..a15a3d402a3 100644 --- a/backends/arm/CMakeLists.txt +++ b/backends/arm/CMakeLists.txt @@ -14,36 +14,102 @@ endif() include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +if(POLICY CMP0169) + # Allow FetchContent_Populate to be used for source-only fetch + cmake_policy(SET CMP0169 OLD) +endif() + set(_common_include_directories ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10 ) add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS) -# bare metal backend builds -if(EXECUTORCH_BUILD_ARM_BAREMETAL) +set(ETHOSU_LINUX_DRIVER_GIT_REPO + "https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-linux-driver-stack.git" + CACHE STRING "Git repository that hosts the Ethos-U Linux driver stack" +) +set(ETHOSU_LINUX_DRIVER_GIT_TAG + "25.11" + CACHE STRING + "Git tag/branch/commit used to fetch the Ethos-U Linux driver stack" +) +set(ETHOSU_LINUX_DRIVER_SOURCE_DIR + "" + CACHE + PATH + "Optional local path to an existing ethos-u-linux-driver stack checkout" +) + +if(EXECUTORCH_BUILD_ARM_BAREMETAL AND EXECUTORCH_BUILD_ARM_ETHOSU_LINUX) + message( + FATAL_ERROR + "EXECUTORCH_BUILD_ARM_BAREMETAL and EXECUTORCH_BUILD_ARM_ETHOSU_LINUX cannot be enabled at the same time." 
+ ) +endif() + +# Ethos-U backend builds (bare metal or Linux driver stack) +if(EXECUTORCH_BUILD_ARM_BAREMETAL OR EXECUTORCH_BUILD_ARM_ETHOSU_LINUX) add_compile_options("-Wall" "-Werror") - # Third-party folder and Ethos-U driver inclued set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party") - set(DRIVER_ETHOSU_INCLUDE_DIR - "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include" - ) - include_directories(${DRIVER_ETHOSU_INCLUDE_DIR}) - set(_arm_baremetal_sources backends/arm/runtime/EthosUBackend.cpp - backends/arm/runtime/VelaBinStream.cpp + set(_arm_backend_sources backends/arm/runtime/EthosUBackend.cpp + backends/arm/runtime/VelaBinStream.cpp ) - list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/") + list(TRANSFORM _arm_backend_sources PREPEND "${EXECUTORCH_ROOT}/") - add_library(executorch_delegate_ethos_u STATIC ${_arm_baremetal_sources}) - target_link_libraries( - executorch_delegate_ethos_u PUBLIC executorch_core ethosu_core_driver - ) + add_library(executorch_delegate_ethos_u STATIC ${_arm_backend_sources}) + target_link_libraries(executorch_delegate_ethos_u PUBLIC executorch_core) + + if(EXECUTORCH_BUILD_ARM_BAREMETAL) + target_sources( + executorch_delegate_ethos_u + PRIVATE ${EXECUTORCH_ROOT}/backends/arm/runtime/EthosUBackend_Cortex_M.cpp + ) + set(DRIVER_ETHOSU_INCLUDE_DIR + "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include" + ) + target_include_directories( + executorch_delegate_ethos_u PRIVATE ${DRIVER_ETHOSU_INCLUDE_DIR} + ) + target_link_libraries(executorch_delegate_ethos_u PUBLIC ethosu_core_driver) + elseif(EXECUTORCH_BUILD_ARM_ETHOSU_LINUX) + target_sources( + executorch_delegate_ethos_u + PRIVATE ${EXECUTORCH_ROOT}/backends/arm/runtime/EthosUBackend_Cortex_A.cpp + ) + if(NOT ETHOSU_LINUX_DRIVER_SOURCE_DIR + OR NOT EXISTS + "${ETHOSU_LINUX_DRIVER_SOURCE_DIR}/driver_library/src/ethosu.cpp" + ) + include(FetchContent) + FetchContent_Declare( + ethosu_linux_driver_src + GIT_REPOSITORY ${ETHOSU_LINUX_DRIVER_GIT_REPO} + GIT_TAG 
${ETHOSU_LINUX_DRIVER_GIT_TAG} + GIT_SHALLOW TRUE + ) + FetchContent_GetProperties(ethosu_linux_driver_src) + if(NOT ethosu_linux_driver_src_POPULATED) + FetchContent_Populate(ethosu_linux_driver_src) + endif() + set(ETHOSU_LINUX_DRIVER_SOURCE_DIR ${ethosu_linux_driver_src_SOURCE_DIR}) + endif() + + target_include_directories( + executorch_delegate_ethos_u + PRIVATE ${ETHOSU_LINUX_DRIVER_SOURCE_DIR}/driver_library/include + ${ETHOSU_LINUX_DRIVER_SOURCE_DIR}/kernel/include + ) + target_sources( + executorch_delegate_ethos_u + PRIVATE ${ETHOSU_LINUX_DRIVER_SOURCE_DIR}/driver_library/src/ethosu.cpp + ) + endif() install(TARGETS executorch_delegate_ethos_u EXPORT ExecuTorchTargets) - # end config for bare metal builds endif() # VGF backend builds diff --git a/backends/arm/runtime/EthosUBackend.cpp b/backends/arm/runtime/EthosUBackend.cpp index f7ad6242f06..71beaeacb0c 100644 --- a/backends/arm/runtime/EthosUBackend.cpp +++ b/backends/arm/runtime/EthosUBackend.cpp @@ -1,59 +1,24 @@ /* - * Copyright 2023-2025 Arm Limited and/or its affiliates. + * Copyright 2023-2026 Arm Limited and/or its affiliates. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ /* - * Arm backend for Ethos-U baremetal driver stack, this relies on the - * ethos-u-core-driver for hardware interaction. + * Common Arm backend for Ethos-U. Please see + * EthosUBackend_Cortex_*.cpp for specific backends. 
*/ -// Workaround for runtime/core/portable_type/c10/c10/util/Float16-math.h -#if defined(__GNUC__) && defined(__ZEPHYR__) -#pragma GCC diagnostic ignored "-Wdouble-promotion" -#endif - #include +#include #include #include +#include +#include +#include -#include - -#if defined(ET_EVENT_TRACER_ENABLED) -#include -#include -using executorch::runtime::EventTracer; -using executorch::runtime::EventTracerEntry; - -class EventTraceScope { - public: - EventTraceScope(EventTracer* event_tracer_, const char* name) { - event_tracer = event_tracer_; - event_tracer_entry_scope = event_tracer->start_profiling(name); - } - ~EventTraceScope() { - event_tracer->end_profiling(event_tracer_entry_scope); - } - - private: - EventTracer* event_tracer; - EventTracerEntry event_tracer_entry_scope; -}; -#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME) \ - EventTraceScope event_tracer_scope = EventTraceScope(EVENTTRACER, NAME) -#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME) \ - SCOPE = EVENTTRACER->start_profiling(NAME) -#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE) \ - EVENTTRACER->end_profiling(SCOPE) - -#else -#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME) -#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME) -#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE) -#endif - +#include #include #include #include @@ -77,16 +42,10 @@ using executorch::runtime::MemoryAllocator; using executorch::runtime::Result; using executorch::runtime::Span; -#define ETHOSU_NUM_BASE_ADDRS 3 - namespace executorch { namespace backends { namespace arm { -typedef struct { - FreeableBuffer* processed; -} ExecutionHandle; - extern "C" { void __attribute__((weak)) EthosUBackend_execute_begin() {} void __attribute__((weak)) EthosUBackend_execute_end() {} @@ -135,8 +94,10 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { if (handle == nullptr) { return Error::MemoryAllocationFailed; } + handle = new (handle) ExecutionHandle(); handle->processed = processed; + 
handle->platform_state = platform_init(compile_specs, allocator); // Return the same buffer we were passed - this data will be // executed directly @@ -193,6 +154,9 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { } EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope); + const int input_count = handles.inputs ? handles.inputs->count : 0; + const int output_count = handles.outputs ? handles.outputs->count : 0; + MemoryAllocator* temp_allocator = context.get_temp_allocator(); // Use a temporary allocator for the intermediate tensors of the // computation. The allocator is released in runtime/executor/method.cpp at @@ -222,7 +186,7 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { // Write argument values (from EValue tensor) into Ethos-U scratch // TODO(MLETORCH-123): Optimise into direct write from Vela into the SRAM // or DRAM output for compatible data layouts. - for (int i = 0; i < handles.inputs->count; i++) { + for (int i = 0; i < input_count; i++) { auto tensor_count = 1, io_count = 1; auto tensor_in = args[i]->toTensor(); char* scratch_addr = ethosu_scratch + handles.inputs->io[i].offset; @@ -291,95 +255,18 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { } } - // Allocate driver handle and synchronously invoke driver - auto driver = - std::unique_ptr( - ethosu_reserve_driver(), ethosu_release_driver); - if (driver == NULL) { - ET_LOG(Error, "ethosu_reserve_driver failed"); - return Error::InvalidState; - } - - // Ethos-U low level driver expected order for Ethos U-55, we have - // constant weight data, then scratch (which contains input and output) - // scratch is written above in this function. 
- - uint64_t bases[ETHOSU_NUM_BASE_ADDRS] = { - static_cast( - reinterpret_cast((handles.weight_data))), - static_cast(reinterpret_cast(ethosu_scratch)), - static_cast( - reinterpret_cast(ethosu_fast_scratch))}; - size_t bases_size[ETHOSU_NUM_BASE_ADDRS] = { - handles.weight_data_size, - handles.scratch_data_size, - ethosu_fast_scratch_size}; - int result = 0; EXECUTORCH_PROF_START( event_tracer, event_tracer_local_scope, "+EthosUBackend::execute()NPU"); - result = ethosu_invoke_v3( - driver.get(), - static_cast(handles.cmd_data), - handles.cmd_data_size, - bases, - bases_size, - ETHOSU_NUM_BASE_ADDRS, /* fixed array of pointers to binary interface*/ - nullptr); + Error platform_status = platform_execute( + context, + execution_handle, + handles, + input_count, + output_count, + args, + ethosu_scratch); EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope); - - if (result != 0) { - ET_LOG(Error, "Ethos-U invocation failed error (%d)", result); - return Error::InvalidProgram; - } - size_t tensor_bytes_total = 0; - size_t io_bytes_total = 0; - // Write outputs from scratch into EValue pointers - for (int i = 0; i < handles.outputs->count; i++) { - int tensor_count = 1, io_count = 1; - const char* output_addr = ethosu_scratch + handles.outputs->io[i].offset; - // Process input EValue into scratch - // Outputs are in the index immediately after inputs - auto tensor_out = args[handles.inputs->count + i]->toTensor(); - - calculate_dimensions( - tensor_out, &handles.outputs->io[i], &tensor_count, &io_count); - - size_t tensor_bytes = tensor_out.nbytes(); - size_t io_bytes = static_cast(io_count) * - static_cast(handles.outputs->io[i].elem_size); - - if (tensor_bytes != io_bytes) { - Error status = copy_with_layout_adjustment( - handles.outputs->io[i], i, output_addr, tensor_out, tensor_bytes); - if (status != Error::Ok) { - return status; - } - io_bytes_total += tensor_bytes; - } else { - EXECUTORCH_PROF_SCOPE( - event_tracer, 
"+EthosUBackend::execute()handles.output.memcpy()"); - - memcpy( - tensor_out.mutable_data_ptr(), - static_cast(output_addr), - tensor_bytes); - io_bytes_total += io_bytes; - } - - // At times the topological order of the outputs may change. - // Lets instead ensure that the sum of output bytes match. - tensor_bytes_total += tensor_bytes; - } - if (tensor_bytes_total != io_bytes_total) { - ET_LOG(Error, "Total output tensor sizes do not match"); - ET_LOG( - Error, - "Program expects %zu bytes but got %zu", - io_bytes_total, - tensor_bytes_total); - return Error::InvalidProgram; - } - return Error::Ok; + return platform_status; } void destroy(DelegateHandle* handle) const override { @@ -387,162 +274,126 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { } private: - // Copies Vela output into the ExecuTorch tensor, adjusting for padding or - // packed layouts produced by the delegate. - Error copy_with_layout_adjustment( - const VelaIO& output_io, - int output_index, - const char* src, - executorch::aten::Tensor& tensor_out, - size_t tensor_bytes) const { - const int elem_size = output_io.elem_size; - if (elem_size == 0) { - ET_LOG( - Error, "Ethos-U output %d reports zero element size", output_index); - return Error::InvalidProgram; - } - - size_t chunk_count = 1; - for (int dim = 0; dim < shapeDim - 1; ++dim) { - const int vela_dim = output_io.shape[dim]; - chunk_count *= static_cast(vela_dim == 0 ? 1 : vela_dim); - } - const int last_dim = output_io.shape[shapeDim - 1]; - const size_t vela_chunk_elems = - static_cast(last_dim == 0 ? 1 : last_dim); - const size_t vela_chunk_size = - vela_chunk_elems * static_cast(elem_size); + // No platform-specific members. 
+}; - if (tensor_bytes % chunk_count != 0) { - ET_LOG( - Error, - "Ethos-U output %d tensor bytes %zu not divisible by chunk count %zu", - output_index, - tensor_bytes, - chunk_count); - return Error::InvalidProgram; - } +Error copy_with_layout_adjustment( + const VelaIO& output_io, + int output_index, + const char* src, + executorch::aten::Tensor& tensor_out, + size_t tensor_bytes) { + const int elem_size = output_io.elem_size; + if (elem_size == 0) { + ET_LOG(Error, "Ethos-U output %d reports zero element size", output_index); + return Error::InvalidProgram; + } - const size_t chunk_size = tensor_bytes / chunk_count; + size_t chunk_count = 1; + for (int dim = 0; dim < shapeDim - 1; ++dim) { + const int vela_dim = output_io.shape[dim]; + chunk_count *= static_cast(vela_dim == 0 ? 1 : vela_dim); + } + const int last_dim = output_io.shape[shapeDim - 1]; + const size_t vela_chunk_elems = + static_cast(last_dim == 0 ? 1 : last_dim); + const size_t vela_chunk_size = + vela_chunk_elems * static_cast(elem_size); - // If Vela writes fewer bytes than the tensor expects we may need to - // expand 4-bit data to 8-bit. Ethos-U outputs may be - // packed 4-bit values but ExecuTorch tensors are at least 8-bit. 
- if (vela_chunk_size < chunk_size) { - if (chunk_size % vela_chunk_size != 0) { - ET_LOG( - Error, - "Ethos-U output %d chunk bytes %zu not divisible by vela chunk bytes %zu", - output_index, - chunk_size, - vela_chunk_size); - return Error::InvalidProgram; - } + if (tensor_bytes % chunk_count != 0) { + ET_LOG( + Error, + "Ethos-U output %d tensor bytes %zu not divisible by chunk count %zu", + output_index, + tensor_bytes, + chunk_count); + return Error::InvalidProgram; + } - const size_t expand_factor = chunk_size / vela_chunk_size; - if (expand_factor == 2 && elem_size == 1 && - tensor_out.scalar_type() == ScalarType::Char) { - return unpack_chunks_4bit_to_int8( - reinterpret_cast(src), - tensor_out.mutable_data_ptr(), - chunk_count, - chunk_size, - vela_chunk_size); - } + const size_t chunk_size = tensor_bytes / chunk_count; + // If Vela writes fewer bytes than the tensor expects we may need to + // expand 4-bit data to 8-bit. Ethos-U outputs may be + // packed 4-bit values but ExecuTorch tensors are at least 8-bit. 
+ if (vela_chunk_size < chunk_size) { + if (chunk_size % vela_chunk_size != 0) { ET_LOG( Error, - "Ethos-U output %d expansion factor %zu with element size %d not supported", + "Ethos-U output %d chunk bytes %zu not divisible by vela chunk bytes %zu", output_index, - expand_factor, - elem_size); + chunk_size, + vela_chunk_size); return Error::InvalidProgram; } - return strip_delegate_padding( - src, - tensor_out.mutable_data_ptr(), - chunk_count, - chunk_size, - vela_chunk_size); - } - - Error unpack_chunks_4bit_to_int8( - const uint8_t* src, - int8_t* dest, - size_t chunk_count, - size_t dest_chunk_size, - size_t src_chunk_size) const { - const uint8_t* chunk_src = src; - int8_t* chunk_dest = dest; - for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) { - unpack_single_chunk_4bit_to_int8(chunk_src, chunk_dest, src_chunk_size); - chunk_src += src_chunk_size; - chunk_dest += dest_chunk_size; - } - return Error::Ok; - } - - void unpack_single_chunk_4bit_to_int8( - const uint8_t* src, - int8_t* dest, - size_t chunk_size) const { - for (size_t byte_idx = 0; byte_idx < chunk_size; ++byte_idx) { - const uint8_t packed = src[byte_idx]; - int8_t low = static_cast(packed & 0x0F); - int8_t high = static_cast((packed >> 4) & 0x0F); - if (low >= 8) { - low -= 16; - } - if (high >= 8) { - high -= 16; + const size_t expand_factor = chunk_size / vela_chunk_size; + if (expand_factor == 2 && elem_size == 1 && + tensor_out.scalar_type() == ScalarType::Char) { + const uint8_t* src_bytes = reinterpret_cast(src); + int8_t* dest = tensor_out.mutable_data_ptr(); + const uint8_t* chunk_src = src_bytes; + int8_t* chunk_dest = dest; + for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) { + for (size_t byte_idx = 0; byte_idx < vela_chunk_size; ++byte_idx) { + const uint8_t packed = chunk_src[byte_idx]; + int8_t low = static_cast(packed & 0x0F); + int8_t high = static_cast((packed >> 4) & 0x0F); + if (low >= 8) { + low -= 16; + } + if (high >= 8) { + high -= 16; + } 
+ chunk_dest[2 * byte_idx] = low; + chunk_dest[2 * byte_idx + 1] = high; + } + chunk_src += vela_chunk_size; + chunk_dest += chunk_size; } - dest[2 * byte_idx] = low; - dest[2 * byte_idx + 1] = high; + return Error::Ok; } + + ET_LOG( + Error, + "Ethos-U output %d expansion factor %zu with element size %d not supported", + output_index, + expand_factor, + elem_size); + return Error::InvalidProgram; } - Error strip_delegate_padding( - const char* src, - char* dest, - size_t chunk_count, - size_t dest_chunk_size, - size_t src_chunk_size) const { - if (dest_chunk_size > src_chunk_size) { - ET_LOG( - Error, - "dest chunk size %zu must not exceed src chunk size %zu", - dest_chunk_size, - src_chunk_size); - return Error::InvalidProgram; - } - if (src == nullptr || dest == nullptr) { - ET_LOG(Error, "Ethos-U padded copy received null buffer"); - return Error::InvalidState; - } - for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) { - memcpy(dest, src, dest_chunk_size); - src += src_chunk_size; - dest += dest_chunk_size; - } - return Error::Ok; + if (src == nullptr) { + ET_LOG(Error, "Ethos-U padded copy received null buffer"); + return Error::InvalidState; + } + char* dest = tensor_out.mutable_data_ptr(); + if (dest == nullptr) { + ET_LOG(Error, "Ethos-U padded copy received null destination"); + return Error::InvalidState; + } + const char* src_bytes = src; + for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) { + memcpy(dest, src_bytes, chunk_size); + src_bytes += vela_chunk_size; + dest += chunk_size; } + return Error::Ok; +} - void calculate_dimensions( - const executorch::aten::Tensor tensor, - VelaIO* io, - int* tensor_count, - int* io_count) const { - for (int i = 0; i < tensor.dim(); i++) { - *tensor_count = *tensor_count * tensor.size(i); - } +void calculate_dimensions( + const executorch::aten::Tensor tensor, + VelaIO* io, + int* tensor_count, + int* io_count) { + for (int i = 0; i < tensor.dim(); i++) { + *tensor_count = *tensor_count * 
tensor.size(i); + } - // The VelaIO type has a shape of fixed size 6 - for (int i = 0; i < shapeDim; i++) { - *io_count = *io_count * io->shape[i]; - } + // The VelaIO type has a shape of fixed size 6 + for (int i = 0; i < shapeDim; i++) { + *io_count = *io_count * io->shape[i]; } -}; +} namespace { auto EthosUBackend_backend = EthosUBackend(); @@ -550,7 +401,6 @@ Backend EthosUBackend_id{"EthosUBackend", &EthosUBackend_backend}; static executorch::runtime::Error EthosUBackend_registered = register_backend(EthosUBackend_id); -#ifdef __ZEPHYR__ /** * This function serves as a linker force-include mechanism to ensure the * EthosU backend module gets properly linked into the final executable, @@ -570,7 +420,6 @@ extern "C" executorch::runtime::Error executorch_delegate_EthosUBackend_registered() { return EthosUBackend_registered; } -#endif } // namespace diff --git a/backends/arm/runtime/EthosUBackend_Cortex_A.cpp b/backends/arm/runtime/EthosUBackend_Cortex_A.cpp new file mode 100644 index 00000000000..2a41fd6c037 --- /dev/null +++ b/backends/arm/runtime/EthosUBackend_Cortex_A.cpp @@ -0,0 +1,396 @@ +/* + * Copyright 2026 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Arm backend for Ethos-U Linux driver stack, this relies on the + * ethos-u-linux-driver-stack for hardware interaction. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +using executorch::runtime::ArrayRef; +using executorch::runtime::BackendExecutionContext; +using executorch::runtime::CompileSpec; +using executorch::runtime::Error; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::Span; + +namespace executorch { +namespace backends { +namespace arm { + +constexpr int64_t kDefaultEthosUTimeoutNs = 60000000000LL; + +struct LinuxDriverOptions { + std::string device_path = "/dev/ethosu0"; + int64_t timeout_ns = kDefaultEthosUTimeoutNs; + bool enable_cycle_counter = true; + std::array pmu_events{}; +}; + +struct PlatformState { + LinuxDriverOptions options; +}; + +namespace { + +template +bool read_scalar_value(const CompileSpec& spec, T* out) { + if (spec.value.buffer == nullptr || spec.value.nbytes != sizeof(T)) { + return false; + } + std::memcpy(out, spec.value.buffer, sizeof(T)); + return true; +} + +std::string read_string_value(const CompileSpec& spec) { + if (spec.value.buffer == nullptr || spec.value.nbytes == 0) { + return ""; + } + const char* raw_begin = static_cast(spec.value.buffer); + const char* raw_end = raw_begin + spec.value.nbytes; + std::string result(raw_begin, raw_end); + while (!result.empty() && result.back() == '\0') { + result.pop_back(); + } + return result; +} + +LinuxDriverOptions parse_linux_options(ArrayRef specs) { + LinuxDriverOptions options; + constexpr char kDeviceKey[] = "ethosu.device"; + constexpr char kTimeoutKey[] = "ethosu.timeout_ns"; + constexpr char kCycleCounterKey[] = "ethosu.enable_cycle_counter"; + constexpr char kPmuPrefix[] = "ethosu.pmu_event"; + + for (const CompileSpec& spec : specs) { + if (spec.key == nullptr) { + continue; + } + + if (strcmp(spec.key, kDeviceKey) == 0) { + std::string device_path = read_string_value(spec); + if (!device_path.empty()) { + options.device_path = device_path; + } + continue; 
+ } + + if (strcmp(spec.key, kTimeoutKey) == 0) { + int64_t timeout = 0; + if (read_scalar_value(spec, &timeout) && timeout > 0) { + options.timeout_ns = timeout; + } + continue; + } + + if (strcmp(spec.key, kCycleCounterKey) == 0) { + uint8_t enabled = 0; + if (read_scalar_value(spec, &enabled)) { + options.enable_cycle_counter = enabled != 0; + } + continue; + } + + if (strncmp(spec.key, kPmuPrefix, strlen(kPmuPrefix)) == 0) { + const char* index_str = spec.key + strlen(kPmuPrefix); + char* endptr = nullptr; + long idx = std::strtol(index_str, &endptr, 10); + if (endptr != index_str && idx >= 0 && + idx < static_cast(ETHOSU_PMU_EVENT_MAX)) { + uint32_t event = 0; + if (read_scalar_value(spec, &event)) { + options.pmu_events[static_cast(idx)] = event; + } + } + } + } + + return options; +} + +class EthosULinuxDeviceCache { + public: + EthosU::Device& get(const std::string& device_path) { + std::lock_guard lock(mutex_); + if (!device_ || device_path != active_path_) { + device_ = std::make_unique(device_path.c_str()); + active_path_ = device_path; + } + return *device_; + } + + private: + std::mutex mutex_; + std::string active_path_; + std::unique_ptr device_; +}; + +EthosULinuxDeviceCache& get_linux_device_cache() { + static EthosULinuxDeviceCache cache; + return cache; +} + +const char* inference_status_to_string(EthosU::InferenceStatus status) { + switch (status) { + case EthosU::InferenceStatus::OK: + return "OK"; + case EthosU::InferenceStatus::ERROR: + return "ERROR"; + case EthosU::InferenceStatus::RUNNING: + return "RUNNING"; + case EthosU::InferenceStatus::REJECTED: + return "REJECTED"; + case EthosU::InferenceStatus::ABORTED: + return "ABORTED"; + case EthosU::InferenceStatus::ABORTING: + return "ABORTING"; + case EthosU::InferenceStatus::PENDING: + return "PENDING"; + } + return "UNKNOWN"; +} + +Error invoke_linux_driver( + const VelaHandles& handles, + const std::vector& input_ptrs, + const std::vector& output_ptrs, + const std::vector& 
input_copy_sizes, + const std::vector& output_copy_sizes, + const LinuxDriverOptions& options) { + if (handles.outputs == nullptr) { + ET_LOG(Error, "Ethos-U backend missing output metadata"); + return Error::InvalidProgram; + } + + try { + EthosU::Device& device = get_linux_device_cache().get(options.device_path); + auto network = std::make_shared( + device, + reinterpret_cast(handles.cmd_data), + handles.cmd_data_size); + + std::shared_ptr constant_buffer = + std::make_shared(); + if (handles.weight_data_size > 0) { + auto constant_buffers = device.createBuffers({handles.weight_data_size}); + constant_buffer = constant_buffers.front(); + constant_buffer->write( + const_cast(handles.weight_data), handles.weight_data_size); + } + + std::shared_ptr intermediate_buffer = + std::make_shared(); + if (handles.scratch_data_size > 0) { + auto scratch_buffers = device.createBuffers({handles.scratch_data_size}); + intermediate_buffer = scratch_buffers.front(); + } + + std::vector> ifm_buffers; + if (handles.inputs != nullptr && handles.inputs->count > 0) { + if (input_copy_sizes.size() != + static_cast(handles.inputs->count)) { + ET_LOG( + Error, + "Mismatch between input metadata (%d) and copy plan (%zu)", + handles.inputs->count, + input_copy_sizes.size()); + return Error::InvalidProgram; + } + if (input_ptrs.size() != input_copy_sizes.size()) { + ET_LOG( + Error, + "Mismatch between input metadata and runtime pointers (%zu vs %zu)", + input_ptrs.size(), + input_copy_sizes.size()); + return Error::InvalidState; + } + ifm_buffers = device.createBuffers(input_copy_sizes); + for (int i = 0; i < handles.inputs->count; ++i) { + const size_t copy_size = input_copy_sizes[i]; + if (copy_size == 0) { + continue; + } + const char* src = input_ptrs[i]; + if (src == nullptr) { + ET_LOG(Error, "Missing input buffer for index %d", i); + return Error::InvalidState; + } + ifm_buffers[i]->write(const_cast(src), copy_size); + } + } + + if (output_copy_sizes.size() != + 
static_cast(handles.outputs->count)) { + ET_LOG( + Error, + "Mismatch between output metadata (%d) and copy plan (%zu)", + handles.outputs->count, + output_copy_sizes.size()); + return Error::InvalidProgram; + } + if (output_ptrs.size() != output_copy_sizes.size()) { + ET_LOG( + Error, + "Mismatch between output metadata and runtime buffers (%zu vs %zu)", + output_ptrs.size(), + output_copy_sizes.size()); + return Error::InvalidState; + } + auto ofm_buffers = device.createBuffers(output_copy_sizes); + + auto inference = std::make_unique( + network, + ifm_buffers.begin(), + ifm_buffers.end(), + ofm_buffers.begin(), + ofm_buffers.end(), + intermediate_buffer, + constant_buffer, + options.pmu_events, + options.enable_cycle_counter); + + if (inference->wait(options.timeout_ns)) { + ET_LOG( + Error, + "Ethos-U inference timed out after %lld ns", + static_cast(options.timeout_ns)); + return Error::InvalidState; + } + + auto status = inference->status(); + if (status != EthosU::InferenceStatus::OK) { + ET_LOG( + Error, + "Ethos-U inference failed with status %s", + inference_status_to_string(status)); + return Error::InvalidState; + } + + if (options.enable_cycle_counter) { + try { + uint64_t cycles = inference->getCycleCounter(); + ET_LOG( + Info, + "Ethos-U Linux delegate cycle counter: %llu", + static_cast(cycles)); + } catch (const std::exception& e) { + ET_LOG(Debug, "Failed to read Ethos-U cycle counter: %s", e.what()); + } + } + + for (int i = 0; i < handles.outputs->count; ++i) { + const size_t copy_size = output_copy_sizes[i]; + if (copy_size == 0) { + continue; + } + char* dst = output_ptrs[i]; + if (dst == nullptr) { + ET_LOG(Error, "Missing output buffer for index %d", i); + return Error::InvalidState; + } + ofm_buffers[i]->read(dst, copy_size); + } + } catch (const std::exception& e) { + ET_LOG(Error, "Ethos-U Linux driver invocation failed: %s", e.what()); + return Error::InvalidState; + } + + return Error::Ok; +} +} // namespace + +PlatformState* 
platform_init( + ArrayRef specs, + MemoryAllocator* allocator) { + if (allocator == nullptr) { + return nullptr; + } + PlatformState* state = allocator->allocateInstance(); + if (state == nullptr) { + return nullptr; + } + state = new (state) PlatformState(); + state->options = parse_linux_options(specs); + return state; +} + +Error platform_execute( + BackendExecutionContext& /*context*/, + const ExecutionHandle* execution_handle, + const VelaHandles& handles, + int input_count, + int output_count, + Span args, + char* /*ethosu_scratch*/) { + std::vector input_copy_sizes; + std::vector linux_input_ptrs; + if (input_count > 0) { + input_copy_sizes.resize(input_count, 0); + linux_input_ptrs.resize(input_count, nullptr); + } + + std::vector output_io_bytes; + std::vector linux_output_ptrs; + if (output_count > 0) { + output_io_bytes.resize(output_count, 0); + linux_output_ptrs.resize(output_count, nullptr); + } + + for (int i = 0; i < input_count; ++i) { + auto tensor_in = args[i]->toTensor(); + linux_input_ptrs[i] = tensor_in.mutable_data_ptr(); + input_copy_sizes[i] = tensor_in.nbytes(); + } + + if (handles.outputs != nullptr) { + for (int i = 0; i < output_count; ++i) { + int tensor_count = 1, io_count = 1; + auto tensor_out = args[input_count + i]->toTensor(); + calculate_dimensions( + tensor_out, &handles.outputs->io[i], &tensor_count, &io_count); + if (i < static_cast(output_io_bytes.size())) { + output_io_bytes[i] = static_cast(io_count) * + static_cast(handles.outputs->io[i].elem_size); + } + linux_output_ptrs[i] = tensor_out.mutable_data_ptr(); + } + } + + const PlatformState* state = execution_handle->platform_state; + if (state == nullptr) { + ET_LOG(Error, "Ethos-U Linux backend missing platform state"); + return Error::InvalidState; + } + + return invoke_linux_driver( + handles, + linux_input_ptrs, + linux_output_ptrs, + input_copy_sizes, + output_io_bytes, + state->options); +} + +} // namespace arm +} // namespace backends +} // namespace executorch 
diff --git a/backends/arm/runtime/EthosUBackend_Cortex_M.cpp b/backends/arm/runtime/EthosUBackend_Cortex_M.cpp
new file mode 100644
index 00000000000..7e6e9f5efaf
--- /dev/null
+++ b/backends/arm/runtime/EthosUBackend_Cortex_M.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2026 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/*
+ * Arm backend for the Ethos-U baremetal driver stack. It relies on the
+ * ethos-u-core-driver for hardware interaction.
+ */
+
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+
+using executorch::runtime::BackendExecutionContext;
+using executorch::runtime::Error;
+using executorch::runtime::Span;
+
+namespace executorch {
+namespace backends {
+namespace arm {
+
+// The baremetal flow keeps no per-platform state.
+struct PlatformState {};
+
+// Nothing to initialize for baremetal: both arguments are ignored and no
+// state object is created.
+PlatformState* platform_init(
+    executorch::runtime::ArrayRef /*specs*/,
+    executorch::runtime::MemoryAllocator* /*allocator*/) {
+  return nullptr;
+}
+
+// Invokes the Ethos-U driver (ethosu_invoke_v3) with the command stream from
+// handles, then copies every output from ethosu_scratch into the output
+// tensors stored at args[input_count .. input_count + output_count).
+//
+// Returns InvalidState if no driver can be reserved, InvalidProgram if the
+// invocation fails or the byte totals disagree, or the error reported by
+// copy_with_layout_adjustment().
+Error platform_execute(
+    BackendExecutionContext& /*context*/,
+    const ExecutionHandle* /*execution_handle*/,
+    const VelaHandles& handles,
+    int input_count,
+    int output_count,
+    Span args,
+    char* ethosu_scratch) {
+  // Reserve a driver handle; the unique_ptr releases it on every return path.
+  auto driver =
+      std::unique_ptr(
+          ethosu_reserve_driver(), ethosu_release_driver);
+  if (driver == nullptr) {
+    ET_LOG(Error, "ethosu_reserve_driver failed");
+    return Error::InvalidState;
+  }
+
+  // Base-address order expected by the Ethos-U low-level driver (Ethos-U55):
+  // constant weight data, then scratch (which contains input and output), then
+  // fast scratch. Nothing in this function writes the scratch buffer itself.
+  uint64_t bases[ETHOSU_NUM_BASE_ADDRS] = {
+      static_cast(reinterpret_cast((handles.weight_data))),
+      static_cast(reinterpret_cast(ethosu_scratch)),
+      static_cast(reinterpret_cast(ethosu_fast_scratch))};
+  size_t bases_size[ETHOSU_NUM_BASE_ADDRS] = {
+      handles.weight_data_size,
+      handles.scratch_data_size,
+      ethosu_fast_scratch_size};
+  int result = ethosu_invoke_v3(
+      driver.get(),
+      static_cast(handles.cmd_data),
+      handles.cmd_data_size,
+      bases,
+      bases_size,
+      ETHOSU_NUM_BASE_ADDRS, /* fixed array of pointers to binary interface*/
+      nullptr);
+
+  if (result != 0) {
+    ET_LOG(Error, "Ethos-U invocation failed error (%d)", result);
+    return Error::InvalidProgram;
+  }
+
+  size_t tensor_bytes_total = 0;
+  size_t io_bytes_total = 0;
+  // Write outputs from scratch into EValue pointers
+  // NOTE(review): handles.outputs is dereferenced below without a null check.
+  for (int i = 0; i < output_count; i++) {
+    int tensor_count = 1, io_count = 1;
+    const char* output_addr = ethosu_scratch + handles.outputs->io[i].offset;
+    // Fetch the destination tensor for this output.
+    // Outputs are at the indices immediately after the inputs.
+    auto tensor_out = args[input_count + i]->toTensor();
+
+    calculate_dimensions(
+        tensor_out, &handles.outputs->io[i], &tensor_count, &io_count);
+
+    size_t tensor_bytes = tensor_out.nbytes();
+    size_t io_bytes = static_cast(io_count) *
+        static_cast(handles.outputs->io[i].elem_size);
+
+    // Sizes differ: let the helper copy with layout adjustment. Otherwise the
+    // bytes are copied verbatim.
+    if (tensor_bytes != io_bytes) {
+      Error status = copy_with_layout_adjustment(
+          handles.outputs->io[i], i, output_addr, tensor_out, tensor_bytes);
+      if (status != Error::Ok) {
+        return status;
+      }
+      io_bytes_total += tensor_bytes;
+    } else {
+      memcpy(
+          tensor_out.mutable_data_ptr(),
+          static_cast(output_addr),
+          tensor_bytes);
+      io_bytes_total += io_bytes;
+    }
+
+    // At times the topological order of the outputs may change.
+    // Let's instead ensure that the sums of the output bytes match.
+    tensor_bytes_total += tensor_bytes;
+  }
+  // NOTE(review): both branches above add the same amount to io_bytes_total
+  // that is added to tensor_bytes_total, so this check cannot fail as written.
+  if (tensor_bytes_total != io_bytes_total) {
+    ET_LOG(Error, "Total output tensor sizes do not match");
+    ET_LOG(
+        Error,
+        "Program expects %zu bytes but got %zu",
+        io_bytes_total,
+        tensor_bytes_total);
+    return Error::InvalidProgram;
+  }
+  return Error::Ok;
+}
+
+} // namespace arm
+} // namespace backends
+} // namespace executorch
diff --git a/backends/arm/runtime/EthosUBackend_Internal.h b/backends/arm/runtime/EthosUBackend_Internal.h
new file mode 100644
index 00000000000..d1e543f07df
--- /dev/null
+++ b/backends/arm/runtime/EthosUBackend_Internal.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2026 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+// Workaround for runtime/core/portable_type/c10/c10/util/Float16-math.h:
+// silence -Wdouble-promotion around the includes on GCC Zephyr builds.
+#if defined(__GNUC__) && defined(__ZEPHYR__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+#endif
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#if defined(__GNUC__) && defined(__ZEPHYR__)
+#pragma GCC diagnostic pop
+#endif
+
+#if defined(ET_EVENT_TRACER_ENABLED)
+#include
+#include
+using executorch::runtime::EventTracer;
+using executorch::runtime::EventTracerEntry;
+
+// Starts a profiling event on construction and ends it on destruction.
+// NOTE(review): the tracer pointer is used without a null check.
+class EventTraceScope {
+ public:
+  EventTraceScope(EventTracer* event_tracer_, const char* name) {
+    event_tracer = event_tracer_;
+    event_tracer_entry_scope = event_tracer->start_profiling(name);
+  }
+  ~EventTraceScope() {
+    event_tracer->end_profiling(event_tracer_entry_scope);
+  }
+
+ private:
+  EventTracer* event_tracer;
+  EventTracerEntry event_tracer_entry_scope;
+};
+// Profiling helpers. They expand to nothing when ET_EVENT_TRACER_ENABLED is
+// not defined (see the #else branch below).
+#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME) \
+  EventTraceScope event_tracer_scope = EventTraceScope(EVENTTRACER, NAME)
+#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME) \
+  SCOPE = EVENTTRACER->start_profiling(NAME)
+#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE) \
+  EVENTTRACER->end_profiling(SCOPE)
+#else
+#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME)
+#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME)
+#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE)
+#endif
+
+// Number of base addresses handed to the driver; the Cortex-M flow passes
+// weight data, scratch and fast scratch.
+#define ETHOSU_NUM_BASE_ADDRS 3
+
+namespace executorch {
+namespace backends {
+namespace arm {
+
+// Defined by the platform-specific source file.
+struct PlatformState;
+
+// Per-delegate handle passed to platform_execute().
+struct ExecutionHandle {
+  executorch::runtime::FreeableBuffer* processed;
+  // Value returned by platform_init(); may be null (the Cortex-M
+  // implementation returns nullptr).
+  PlatformState* platform_state;
+};
+
+// Symbols with C linkage that are declared here and defined elsewhere.
+// NOTE(review): presumably provided by the application or runner - confirm.
+extern "C" {
+void EthosUBackend_execute_begin();
+void EthosUBackend_execute_end();
+extern unsigned char* ethosu_fast_scratch;
+extern size_t ethosu_fast_scratch_size;
+}
+
+// Platform hooks: each platform source file (for example
+// EthosUBackend_Cortex_M.cpp) supplies its own definitions.
+PlatformState* platform_init(
+    executorch::runtime::ArrayRef specs,
+    executorch::runtime::MemoryAllocator* allocator);
+executorch::runtime::Error platform_execute(
+    executorch::runtime::BackendExecutionContext& context,
+    const ExecutionHandle* execution_handle,
+    const VelaHandles& handles,
+    int input_count,
+    int output_count,
+    executorch::runtime::Span args,
+    char* ethosu_scratch);
+
+// Copies one output from src into tensor_out. The Cortex-M flow calls it when
+// the tensor byte size and the Vela IO byte size differ.
+executorch::runtime::Error copy_with_layout_adjustment(
+    const VelaIO& output_io,
+    int output_index,
+    const char* src,
+    executorch::aten::Tensor& tensor_out,
+    size_t tensor_bytes);
+
+// Writes results through tensor_count and io_count; callers multiply io_count
+// by the IO element size to get a byte count.
+// NOTE(review): presumably these are element counts - confirm.
+void calculate_dimensions(
+    const executorch::aten::Tensor tensor,
+    VelaIO* io,
+    int* tensor_count,
+    int* io_count);
+
+} // namespace arm
+} // namespace backends
+} // namespace executorch
diff --git a/backends/arm/runtime/VelaBinStream.cpp b/backends/arm/runtime/VelaBinStream.cpp
index c8d568499c9..70c5b0c7666 100644
--- a/backends/arm/runtime/VelaBinStream.cpp
+++ b/backends/arm/runtime/VelaBinStream.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023, 2025 Arm Limited and/or its affiliates.
+ * Copyright 2023, 2025-2026 Arm Limited and/or its affiliates.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -65,8 +65,10 @@ bool vela_bin_read(const char* data, VelaHandles* handles, int size) {
       return false;
     } else if (!strncmp(b->name, "cmd_data", strlen("cmd_data"))) {
       // This driver magic header confirms a valid command stream in binary
-      if (strncmp(b->data, "COP1", strlen("COP1")))
+      if (strncmp(b->data, "COP1", strlen("COP1")) &&
+          strncmp(b->data, "COP2", strlen("COP2"))) {
         return false;
+      }
       handles->cmd_data = b->data;
       handles->cmd_data_size = b->size;
     } else if (!strncmp(b->name, "weight_data", strlen("weight_data"))) {
diff --git a/examples/arm/ethos-u-setup/aarch64-linux-musl-toolchain.cmake b/examples/arm/ethos-u-setup/aarch64-linux-musl-toolchain.cmake
new file mode 100644
index 00000000000..e4b8af62067
--- /dev/null
+++ b/examples/arm/ethos-u-setup/aarch64-linux-musl-toolchain.cmake
@@ -0,0 +1,79 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Minimum version kept low so this file can be used even if the global project
+# requires a newer CMake.
+cmake_minimum_required(VERSION 3.20)
+
+# Toolchain root for the standalone aarch64-linux-musl cross compiler.
+set(MUSL_TOOLCHAIN_ROOT
+    ""
+    CACHE PATH "Root of the aarch64-linux-musl toolchain"
+)
+# Fall back to the MUSL_TOOLCHAIN_ROOT environment variable when the variable
+# is still empty.
+if(MUSL_TOOLCHAIN_ROOT STREQUAL "" AND DEFINED ENV{MUSL_TOOLCHAIN_ROOT})
+  set(MUSL_TOOLCHAIN_ROOT "$ENV{MUSL_TOOLCHAIN_ROOT}")
+endif()
+# Neither source provided a root: stop the configure step with an error.
+if(MUSL_TOOLCHAIN_ROOT STREQUAL "")
+  message(
+    FATAL_ERROR
+      "MUSL_TOOLCHAIN_ROOT is required (e.g. -DMUSL_TOOLCHAIN_ROOT=/path/to/aarch64-linux-musl-cross or export MUSL_TOOLCHAIN_ROOT=...)"
+  )
+endif()
+
+# Ensure the toolchain root is forwarded to try_compile checks.
+list(APPEND CMAKE_TRY_COMPILE_PLATFORM_VARIABLES MUSL_TOOLCHAIN_ROOT)
+set(_MUSL_SYSROOT "${MUSL_TOOLCHAIN_ROOT}/aarch64-linux-musl")
+set(_MUSL_BIN_DIR "${MUSL_TOOLCHAIN_ROOT}/bin")
+
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
+
+set(CMAKE_SYSROOT
+    "${_MUSL_SYSROOT}"
+    CACHE PATH "Musl target sysroot"
+)
+
+set(CMAKE_C_COMPILER
+    "${_MUSL_BIN_DIR}/aarch64-linux-musl-gcc"
+    CACHE FILEPATH "Musl cross C compiler"
+)
+set(CMAKE_CXX_COMPILER
+    "${_MUSL_BIN_DIR}/aarch64-linux-musl-g++"
+    CACHE FILEPATH "Musl cross C++ compiler"
+)
+set(CMAKE_AR
+    "${_MUSL_BIN_DIR}/aarch64-linux-musl-ar"
+    CACHE FILEPATH "Musl archiver"
+)
+set(CMAKE_RANLIB
+    "${_MUSL_BIN_DIR}/aarch64-linux-musl-ranlib"
+    CACHE FILEPATH "Musl ranlib"
+)
+set(CMAKE_STRIP
+    "${_MUSL_BIN_DIR}/aarch64-linux-musl-strip"
+    CACHE FILEPATH "Musl strip"
+)
+
+set(CMAKE_FIND_ROOT_PATH "${CMAKE_SYSROOT}")
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) # programs come from the host
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) # libraries only from the sysroot
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) # headers only from the sysroot
+set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) # packages only from the sysroot
+
+# Keep a caller-provided PKG_CONFIG_SYSROOT_DIR untouched; otherwise point
+# pkg-config at the musl sysroot.
+if(NOT DEFINED ENV{PKG_CONFIG_SYSROOT_DIR})
+  set(ENV{PKG_CONFIG_SYSROOT_DIR} "${CMAKE_SYSROOT}")
+endif()
+
+if(DEFINED ENV{PKG_CONFIG_PATH})
+  set(ENV{PKG_CONFIG_PATH}
+      "${CMAKE_SYSROOT}/usr/lib/pkgconfig:${CMAKE_SYSROOT}/usr/share/pkgconfig:$ENV{PKG_CONFIG_PATH}"
+  )
+else()
+  set(ENV{PKG_CONFIG_PATH}
+      "${CMAKE_SYSROOT}/usr/lib/pkgconfig:${CMAKE_SYSROOT}/usr/share/pkgconfig"
+  )
+endif()
diff --git a/examples/arm/executor_runner/ethosu_link_helper.cpp b/examples/arm/executor_runner/ethosu_link_helper.cpp
new file mode 100644
index 00000000000..3130dfbd78b
--- /dev/null
+++ b/examples/arm/executor_runner/ethosu_link_helper.cpp
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2026 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Helper to force-link the Ethos-U backend when building the portable runner.
+
+#if defined(EXECUTORCH_BUILD_ARM_ETHOSU_LINUX)
+#include
+
+extern "C" ::executorch::runtime::Error
+executorch_delegate_EthosUBackend_registered();
+
+namespace {
+struct EthosULinkHook {
+  EthosULinkHook() {
+    // Reference the symbol so the linker keeps the Ethos-U backend object file.
+    (void)executorch_delegate_EthosUBackend_registered();
+  }
+};
+
+static EthosULinkHook g_link_hook; // its constructor makes the call above
+} // namespace
+#endif // EXECUTORCH_BUILD_ARM_ETHOSU_LINUX
diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake
index 9a16f1ae4f4..5a7d777580c 100644
--- a/tools/cmake/preset/default.cmake
+++ b/tools/cmake/preset/default.cmake
@@ -1,6 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
-# Copyright 2025 Arm Limited and/or its affiliates.
+# Copyright 2025-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -63,6 +63,10 @@ define_overridable_option(
   EXECUTORCH_BUILD_ARM_BAREMETAL
   "Build the Arm Baremetal flow for Cortex-M and Ethos-U" BOOL OFF
 )
+define_overridable_option(
+  EXECUTORCH_BUILD_ARM_ETHOSU_LINUX
+  "Build the Arm Ethos-U backend for the Linux driver stack" BOOL OFF
+)
 define_overridable_option(
   EXECUTORCH_BUILD_KERNELS_LLM "Build the custom kernels" BOOL OFF
 )
@@ -232,6 +236,11 @@ check_conflicting_options_on(
   EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES
 )
+
+check_conflicting_options_on(
+  IF_ON EXECUTORCH_BUILD_ARM_ETHOSU_LINUX CONFLICTS_WITH
+  EXECUTORCH_BUILD_ARM_BAREMETAL
+)
 
 # TODO(jathu): move this to platform specific presets when created
 set(_default_executorch_build_executor_runner ON)
 if(APPLE AND "${SDK_NAME}" STREQUAL "iphoneos")