diff --git a/CMakeLists.txt b/CMakeLists.txt
index 46f55800d01..a270cadcd82 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
-# Copyright 2024-2025 Arm Limited and/or its affiliates.
+# Copyright 2024-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -614,8 +614,14 @@ install(FILES tools/cmake/executorch-config.cmake
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ExecuTorch
 )
 
-if(EXECUTORCH_BUILD_ARM_BAREMETAL)
+if(EXECUTORCH_BUILD_ARM_BAREMETAL
+   OR EXECUTORCH_BUILD_ARM_ETHOSU_LINUX
+   OR EXECUTORCH_BUILD_VGF
+)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm)
+endif()
+
+if(EXECUTORCH_BUILD_ARM_BAREMETAL OR EXECUTORCH_BUILD_ARM_ETHOSU_LINUX)
   list(APPEND _executorch_backends executorch_delegate_ethos_u)
 endif()
 
@@ -1063,7 +1069,6 @@ if(EXECUTORCH_BUILD_VULKAN)
 endif()
 
 if(EXECUTORCH_BUILD_VGF)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm)
   list(APPEND _executorch_backends vgf_backend)
 endif()
 
@@ -1197,6 +1202,21 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
   endif()
   target_link_libraries(executor_runner ${_executor_runner_libs})
   target_compile_options(executor_runner PUBLIC ${_common_compile_options})
+  if(EXECUTORCH_BUILD_ARM_ETHOSU_LINUX)
+    target_sources(
+      executor_runner
+      PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/examples/arm/executor_runner/ethosu_link_helper.cpp
+    )
+    target_compile_definitions(
+      executor_runner PRIVATE EXECUTORCH_BUILD_ARM_ETHOSU_LINUX=1
+    )
+    # Link libstdc++ and libgcc statically, as the delegate_runner does, so
+    # the runner does not depend on the target image's C++ runtime.
+    target_link_options(
+      executor_runner PRIVATE -static-libstdc++ -static-libgcc
+    )
+  endif()
 
   # Automatically set when using `emcmake cmake` for Wasm build.
   if(EMSCRIPTEN)
diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt
index 0ffa6f172bf..a15a3d402a3 100644
--- a/backends/arm/CMakeLists.txt
+++ b/backends/arm/CMakeLists.txt
@@ -14,36 +14,102 @@ endif()
 
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 
+if(POLICY CMP0169)
+  # Keep deprecated FetchContent_Populate(<name>) usable for source-only fetch
+  cmake_policy(SET CMP0169 OLD)
+endif()
+
 set(_common_include_directories
     ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
 )
 add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS)
 
-# bare metal backend builds
-if(EXECUTORCH_BUILD_ARM_BAREMETAL)
+set(ETHOSU_LINUX_DRIVER_GIT_REPO
+    "https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-linux-driver-stack.git"
+    CACHE STRING "Git repository that hosts the Ethos-U Linux driver stack"
+)
+set(ETHOSU_LINUX_DRIVER_GIT_TAG
+    "25.11"
+    CACHE STRING
+          "Git tag/branch/commit used to fetch the Ethos-U Linux driver stack"
+)
+set(ETHOSU_LINUX_DRIVER_SOURCE_DIR
+    ""
+    CACHE
+      PATH
+      "Optional local path to an existing ethos-u-linux-driver stack checkout"
+)
+
+if(EXECUTORCH_BUILD_ARM_BAREMETAL AND EXECUTORCH_BUILD_ARM_ETHOSU_LINUX)
+  message(
+    FATAL_ERROR
+      "EXECUTORCH_BUILD_ARM_BAREMETAL and EXECUTORCH_BUILD_ARM_ETHOSU_LINUX cannot be enabled at the same time."
+  )
+endif()
+
+# Ethos-U backend builds (bare metal or Linux driver stack)
+if(EXECUTORCH_BUILD_ARM_BAREMETAL OR EXECUTORCH_BUILD_ARM_ETHOSU_LINUX)
 
   add_compile_options("-Wall" "-Werror")
 
-  # Third-party folder and Ethos-U driver inclued
   set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
-  set(DRIVER_ETHOSU_INCLUDE_DIR
-      "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include"
-  )
-  include_directories(${DRIVER_ETHOSU_INCLUDE_DIR})
 
-  set(_arm_baremetal_sources backends/arm/runtime/EthosUBackend.cpp
-                             backends/arm/runtime/VelaBinStream.cpp
+  set(_arm_backend_sources backends/arm/runtime/EthosUBackend.cpp
+                           backends/arm/runtime/VelaBinStream.cpp
   )
-  list(TRANSFORM _arm_baremetal_sources PREPEND "${EXECUTORCH_ROOT}/")
+  list(TRANSFORM _arm_backend_sources PREPEND "${EXECUTORCH_ROOT}/")
 
-  add_library(executorch_delegate_ethos_u STATIC ${_arm_baremetal_sources})
-  target_link_libraries(
-    executorch_delegate_ethos_u PUBLIC executorch_core ethosu_core_driver
-  )
+  add_library(executorch_delegate_ethos_u STATIC ${_arm_backend_sources})
+  target_link_libraries(executorch_delegate_ethos_u PUBLIC executorch_core)
+
+  if(EXECUTORCH_BUILD_ARM_BAREMETAL)
+    target_sources(
+      executorch_delegate_ethos_u
+      PRIVATE ${EXECUTORCH_ROOT}/backends/arm/runtime/EthosUBackend_Cortex_M.cpp
+    )
+    set(DRIVER_ETHOSU_INCLUDE_DIR
+        "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include"
+    )
+    target_include_directories(
+      executorch_delegate_ethos_u PRIVATE ${DRIVER_ETHOSU_INCLUDE_DIR}
+    )
+    target_link_libraries(executorch_delegate_ethos_u PUBLIC ethosu_core_driver)
+  elseif(EXECUTORCH_BUILD_ARM_ETHOSU_LINUX)
+    target_sources(
+      executorch_delegate_ethos_u
+      PRIVATE ${EXECUTORCH_ROOT}/backends/arm/runtime/EthosUBackend_Cortex_A.cpp
+    )
+    if(NOT ETHOSU_LINUX_DRIVER_SOURCE_DIR
+       OR NOT EXISTS
+          "${ETHOSU_LINUX_DRIVER_SOURCE_DIR}/driver_library/src/ethosu.cpp"
+    )
+      include(FetchContent)
+      FetchContent_Declare(
+        ethosu_linux_driver_src
+        GIT_REPOSITORY ${ETHOSU_LINUX_DRIVER_GIT_REPO}
+        GIT_TAG ${ETHOSU_LINUX_DRIVER_GIT_TAG}
+        GIT_SHALLOW TRUE
+      )
+      FetchContent_GetProperties(ethosu_linux_driver_src)
+      if(NOT ethosu_linux_driver_src_POPULATED)
+        FetchContent_Populate(ethosu_linux_driver_src)
+      endif()
+      set(ETHOSU_LINUX_DRIVER_SOURCE_DIR ${ethosu_linux_driver_src_SOURCE_DIR})
+    endif()
+
+    target_include_directories(
+      executorch_delegate_ethos_u
+      PRIVATE ${ETHOSU_LINUX_DRIVER_SOURCE_DIR}/driver_library/include
+              ${ETHOSU_LINUX_DRIVER_SOURCE_DIR}/kernel/include
+    )
+    target_sources(
+      executorch_delegate_ethos_u
+      PRIVATE ${ETHOSU_LINUX_DRIVER_SOURCE_DIR}/driver_library/src/ethosu.cpp
+    )
+  endif()
 
   install(TARGETS executorch_delegate_ethos_u EXPORT ExecuTorchTargets)
 
-  # end config for bare metal builds
 endif()
 
 # VGF backend builds
diff --git a/backends/arm/runtime/EthosUBackend.cpp b/backends/arm/runtime/EthosUBackend.cpp
index f7ad6242f06..71beaeacb0c 100644
--- a/backends/arm/runtime/EthosUBackend.cpp
+++ b/backends/arm/runtime/EthosUBackend.cpp
@@ -1,59 +1,24 @@
 /*
- * Copyright 2023-2025 Arm Limited and/or its affiliates.
+ * Copyright 2023-2026 Arm Limited and/or its affiliates.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
 
 /*
- * Arm backend for Ethos-U baremetal driver stack, this relies on the
- * ethos-u-core-driver for hardware interaction.
+ * Common Arm backend for Ethos-U. Please see
+ * EthosUBackend_Cortex_*.cpp for specific backends.
  */
 
-// Workaround for runtime/core/portable_type/c10/c10/util/Float16-math.h
-#if defined(__GNUC__) && defined(__ZEPHYR__)
-#pragma GCC diagnostic ignored "-Wdouble-promotion"
-#endif
-
 #include <cstdint>
+#include <cstdlib>
 #include <cstring>
 #include <memory>
+#include <new>
+#include <string>
+#include <vector>
 
-#include <ethosu_driver.h>
-
-#if defined(ET_EVENT_TRACER_ENABLED)
-#include <executorch/runtime/core/event_tracer.h>
-#include <executorch/runtime/core/event_tracer_hooks.h>
-using executorch::runtime::EventTracer;
-using executorch::runtime::EventTracerEntry;
-
-class EventTraceScope {
- public:
-  EventTraceScope(EventTracer* event_tracer_, const char* name) {
-    event_tracer = event_tracer_;
-    event_tracer_entry_scope = event_tracer->start_profiling(name);
-  }
-  ~EventTraceScope() {
-    event_tracer->end_profiling(event_tracer_entry_scope);
-  }
-
- private:
-  EventTracer* event_tracer;
-  EventTracerEntry event_tracer_entry_scope;
-};
-#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME) \
-  EventTraceScope event_tracer_scope = EventTraceScope(EVENTTRACER, NAME)
-#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME) \
-  SCOPE = EVENTTRACER->start_profiling(NAME)
-#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE) \
-  EVENTTRACER->end_profiling(SCOPE)
-
-#else
-#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME)
-#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME)
-#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE)
-#endif
-
+#include <executorch/backends/arm/runtime/EthosUBackend_Internal.h>
 #include <executorch/backends/arm/runtime/VelaBinStream.h>
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>
@@ -77,16 +42,10 @@ using executorch::runtime::MemoryAllocator;
 using executorch::runtime::Result;
 using executorch::runtime::Span;
 
-#define ETHOSU_NUM_BASE_ADDRS 3
-
 namespace executorch {
 namespace backends {
 namespace arm {
 
-typedef struct {
-  FreeableBuffer* processed;
-} ExecutionHandle;
-
 extern "C" {
 void __attribute__((weak)) EthosUBackend_execute_begin() {}
 void __attribute__((weak)) EthosUBackend_execute_end() {}
@@ -135,8 +94,10 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
     if (handle == nullptr) {
       return Error::MemoryAllocationFailed;
     }
+    handle = new (handle) ExecutionHandle();
 
     handle->processed = processed;
+    handle->platform_state = platform_init(compile_specs, allocator);
 
     // Return the same buffer we were passed - this data will be
     // executed directly
@@ -193,6 +154,9 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
     }
     EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope);
 
+    const int input_count = handles.inputs ? handles.inputs->count : 0;
+    const int output_count = handles.outputs ? handles.outputs->count : 0;
+
     MemoryAllocator* temp_allocator = context.get_temp_allocator();
     // Use a temporary allocator for the intermediate tensors of the
     // computation. The allocator is released in runtime/executor/method.cpp at
@@ -222,7 +186,7 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
     // Write argument values (from EValue tensor) into Ethos-U scratch
     // TODO(MLETORCH-123): Optimise into direct write from Vela into the SRAM
     //                     or DRAM output for compatible data layouts.
-    for (int i = 0; i < handles.inputs->count; i++) {
+    for (int i = 0; i < input_count; i++) {
       auto tensor_count = 1, io_count = 1;
       auto tensor_in = args[i]->toTensor();
       char* scratch_addr = ethosu_scratch + handles.inputs->io[i].offset;
@@ -291,95 +255,18 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
       }
     }
 
-    // Allocate driver handle and synchronously invoke driver
-    auto driver =
-        std::unique_ptr<ethosu_driver, decltype(&ethosu_release_driver)>(
-            ethosu_reserve_driver(), ethosu_release_driver);
-    if (driver == NULL) {
-      ET_LOG(Error, "ethosu_reserve_driver failed");
-      return Error::InvalidState;
-    }
-
-    // Ethos-U low level driver expected order for Ethos U-55, we have
-    // constant weight data, then scratch (which contains input and output)
-    // scratch is written above in this function.
-
-    uint64_t bases[ETHOSU_NUM_BASE_ADDRS] = {
-        static_cast<uint64_t>(
-            reinterpret_cast<uintptr_t>((handles.weight_data))),
-        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ethosu_scratch)),
-        static_cast<uint64_t>(
-            reinterpret_cast<uintptr_t>(ethosu_fast_scratch))};
-    size_t bases_size[ETHOSU_NUM_BASE_ADDRS] = {
-        handles.weight_data_size,
-        handles.scratch_data_size,
-        ethosu_fast_scratch_size};
-    int result = 0;
     EXECUTORCH_PROF_START(
         event_tracer, event_tracer_local_scope, "+EthosUBackend::execute()NPU");
-    result = ethosu_invoke_v3(
-        driver.get(),
-        static_cast<const void*>(handles.cmd_data),
-        handles.cmd_data_size,
-        bases,
-        bases_size,
-        ETHOSU_NUM_BASE_ADDRS, /* fixed array of pointers to binary interface*/
-        nullptr);
+    Error platform_status = platform_execute(
+        context,
+        execution_handle,
+        handles,
+        input_count,
+        output_count,
+        args,
+        ethosu_scratch);
     EXECUTORCH_PROF_END(event_tracer, event_tracer_local_scope);
-
-    if (result != 0) {
-      ET_LOG(Error, "Ethos-U invocation failed error (%d)", result);
-      return Error::InvalidProgram;
-    }
-    size_t tensor_bytes_total = 0;
-    size_t io_bytes_total = 0;
-    // Write outputs from scratch into EValue pointers
-    for (int i = 0; i < handles.outputs->count; i++) {
-      int tensor_count = 1, io_count = 1;
-      const char* output_addr = ethosu_scratch + handles.outputs->io[i].offset;
-      // Process input EValue into scratch
-      // Outputs are in the index immediately after inputs
-      auto tensor_out = args[handles.inputs->count + i]->toTensor();
-
-      calculate_dimensions(
-          tensor_out, &handles.outputs->io[i], &tensor_count, &io_count);
-
-      size_t tensor_bytes = tensor_out.nbytes();
-      size_t io_bytes = static_cast<size_t>(io_count) *
-          static_cast<size_t>(handles.outputs->io[i].elem_size);
-
-      if (tensor_bytes != io_bytes) {
-        Error status = copy_with_layout_adjustment(
-            handles.outputs->io[i], i, output_addr, tensor_out, tensor_bytes);
-        if (status != Error::Ok) {
-          return status;
-        }
-        io_bytes_total += tensor_bytes;
-      } else {
-        EXECUTORCH_PROF_SCOPE(
-            event_tracer, "+EthosUBackend::execute()handles.output.memcpy()");
-
-        memcpy(
-            tensor_out.mutable_data_ptr<char>(),
-            static_cast<const char*>(output_addr),
-            tensor_bytes);
-        io_bytes_total += io_bytes;
-      }
-
-      // At times the topological order of the outputs may change.
-      // Lets instead ensure that the sum of output bytes match.
-      tensor_bytes_total += tensor_bytes;
-    }
-    if (tensor_bytes_total != io_bytes_total) {
-      ET_LOG(Error, "Total output tensor sizes do not match");
-      ET_LOG(
-          Error,
-          "Program expects %zu bytes but got %zu",
-          io_bytes_total,
-          tensor_bytes_total);
-      return Error::InvalidProgram;
-    }
-    return Error::Ok;
+    return platform_status;
   }
 
   void destroy(DelegateHandle* handle) const override {
@@ -387,162 +274,126 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
   }
 
  private:
-  // Copies Vela output into the ExecuTorch tensor, adjusting for padding or
-  // packed layouts produced by the delegate.
-  Error copy_with_layout_adjustment(
-      const VelaIO& output_io,
-      int output_index,
-      const char* src,
-      executorch::aten::Tensor& tensor_out,
-      size_t tensor_bytes) const {
-    const int elem_size = output_io.elem_size;
-    if (elem_size == 0) {
-      ET_LOG(
-          Error, "Ethos-U output %d reports zero element size", output_index);
-      return Error::InvalidProgram;
-    }
-
-    size_t chunk_count = 1;
-    for (int dim = 0; dim < shapeDim - 1; ++dim) {
-      const int vela_dim = output_io.shape[dim];
-      chunk_count *= static_cast<size_t>(vela_dim == 0 ? 1 : vela_dim);
-    }
-    const int last_dim = output_io.shape[shapeDim - 1];
-    const size_t vela_chunk_elems =
-        static_cast<size_t>(last_dim == 0 ? 1 : last_dim);
-    const size_t vela_chunk_size =
-        vela_chunk_elems * static_cast<size_t>(elem_size);
+  // No platform-specific members.
+};
 
-    if (tensor_bytes % chunk_count != 0) {
-      ET_LOG(
-          Error,
-          "Ethos-U output %d tensor bytes %zu not divisible by chunk count %zu",
-          output_index,
-          tensor_bytes,
-          chunk_count);
-      return Error::InvalidProgram;
-    }
+Error copy_with_layout_adjustment(
+    const VelaIO& output_io,
+    int output_index,
+    const char* src,
+    executorch::aten::Tensor& tensor_out,
+    size_t tensor_bytes) {
+  const int elem_size = output_io.elem_size;
+  if (elem_size == 0) {
+    ET_LOG(Error, "Ethos-U output %d reports zero element size", output_index);
+    return Error::InvalidProgram;
+  }
 
-    const size_t chunk_size = tensor_bytes / chunk_count;
+  size_t chunk_count = 1;
+  for (int dim = 0; dim < shapeDim - 1; ++dim) {
+    const int vela_dim = output_io.shape[dim];
+    chunk_count *= static_cast<size_t>(vela_dim == 0 ? 1 : vela_dim);
+  }
+  const int last_dim = output_io.shape[shapeDim - 1];
+  const size_t vela_chunk_elems =
+      static_cast<size_t>(last_dim == 0 ? 1 : last_dim);
+  const size_t vela_chunk_size =
+      vela_chunk_elems * static_cast<size_t>(elem_size);
 
-    // If Vela writes fewer bytes than the tensor expects we may need to
-    // expand 4-bit data to 8-bit. Ethos-U outputs may be
-    // packed 4-bit values but ExecuTorch tensors are at least 8-bit.
-    if (vela_chunk_size < chunk_size) {
-      if (chunk_size % vela_chunk_size != 0) {
-        ET_LOG(
-            Error,
-            "Ethos-U output %d chunk bytes %zu not divisible by vela chunk bytes %zu",
-            output_index,
-            chunk_size,
-            vela_chunk_size);
-        return Error::InvalidProgram;
-      }
+  if (tensor_bytes % chunk_count != 0) {
+    ET_LOG(
+        Error,
+        "Ethos-U output %d tensor bytes %zu not divisible by chunk count %zu",
+        output_index,
+        tensor_bytes,
+        chunk_count);
+    return Error::InvalidProgram;
+  }
 
-      const size_t expand_factor = chunk_size / vela_chunk_size;
-      if (expand_factor == 2 && elem_size == 1 &&
-          tensor_out.scalar_type() == ScalarType::Char) {
-        return unpack_chunks_4bit_to_int8(
-            reinterpret_cast<const uint8_t*>(src),
-            tensor_out.mutable_data_ptr<int8_t>(),
-            chunk_count,
-            chunk_size,
-            vela_chunk_size);
-      }
+  const size_t chunk_size = tensor_bytes / chunk_count;
 
+  // If Vela writes fewer bytes than the tensor expects we may need to
+  // expand 4-bit data to 8-bit. Ethos-U outputs may be
+  // packed 4-bit values but ExecuTorch tensors are at least 8-bit.
+  if (vela_chunk_size < chunk_size) {
+    if (chunk_size % vela_chunk_size != 0) {
       ET_LOG(
           Error,
-          "Ethos-U output %d expansion factor %zu with element size %d not supported",
+          "Ethos-U output %d chunk bytes %zu not divisible by vela chunk bytes %zu",
           output_index,
-          expand_factor,
-          elem_size);
+          chunk_size,
+          vela_chunk_size);
       return Error::InvalidProgram;
     }
 
-    return strip_delegate_padding(
-        src,
-        tensor_out.mutable_data_ptr<char>(),
-        chunk_count,
-        chunk_size,
-        vela_chunk_size);
-  }
-
-  Error unpack_chunks_4bit_to_int8(
-      const uint8_t* src,
-      int8_t* dest,
-      size_t chunk_count,
-      size_t dest_chunk_size,
-      size_t src_chunk_size) const {
-    const uint8_t* chunk_src = src;
-    int8_t* chunk_dest = dest;
-    for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) {
-      unpack_single_chunk_4bit_to_int8(chunk_src, chunk_dest, src_chunk_size);
-      chunk_src += src_chunk_size;
-      chunk_dest += dest_chunk_size;
-    }
-    return Error::Ok;
-  }
-
-  void unpack_single_chunk_4bit_to_int8(
-      const uint8_t* src,
-      int8_t* dest,
-      size_t chunk_size) const {
-    for (size_t byte_idx = 0; byte_idx < chunk_size; ++byte_idx) {
-      const uint8_t packed = src[byte_idx];
-      int8_t low = static_cast<int8_t>(packed & 0x0F);
-      int8_t high = static_cast<int8_t>((packed >> 4) & 0x0F);
-      if (low >= 8) {
-        low -= 16;
-      }
-      if (high >= 8) {
-        high -= 16;
+    const size_t expand_factor = chunk_size / vela_chunk_size;
+    if (expand_factor == 2 && elem_size == 1 &&
+        tensor_out.scalar_type() == ScalarType::Char) {
+      const uint8_t* src_bytes = reinterpret_cast<const uint8_t*>(src);
+      int8_t* dest = tensor_out.mutable_data_ptr<int8_t>();
+      const uint8_t* chunk_src = src_bytes;
+      int8_t* chunk_dest = dest;
+      for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) {
+        for (size_t byte_idx = 0; byte_idx < vela_chunk_size; ++byte_idx) {
+          const uint8_t packed = chunk_src[byte_idx];
+          int8_t low = static_cast<int8_t>(packed & 0x0F);
+          int8_t high = static_cast<int8_t>((packed >> 4) & 0x0F);
+          if (low >= 8) {
+            low -= 16;
+          }
+          if (high >= 8) {
+            high -= 16;
+          }
+          chunk_dest[2 * byte_idx] = low;
+          chunk_dest[2 * byte_idx + 1] = high;
+        }
+        chunk_src += vela_chunk_size;
+        chunk_dest += chunk_size;
       }
-      dest[2 * byte_idx] = low;
-      dest[2 * byte_idx + 1] = high;
+      return Error::Ok;
     }
+
+    ET_LOG(
+        Error,
+        "Ethos-U output %d expansion factor %zu with element size %d not supported",
+        output_index,
+        expand_factor,
+        elem_size);
+    return Error::InvalidProgram;
   }
 
-  Error strip_delegate_padding(
-      const char* src,
-      char* dest,
-      size_t chunk_count,
-      size_t dest_chunk_size,
-      size_t src_chunk_size) const {
-    if (dest_chunk_size > src_chunk_size) {
-      ET_LOG(
-          Error,
-          "dest chunk size %zu must not exceed src chunk size %zu",
-          dest_chunk_size,
-          src_chunk_size);
-      return Error::InvalidProgram;
-    }
-    if (src == nullptr || dest == nullptr) {
-      ET_LOG(Error, "Ethos-U padded copy received null buffer");
-      return Error::InvalidState;
-    }
-    for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) {
-      memcpy(dest, src, dest_chunk_size);
-      src += src_chunk_size;
-      dest += dest_chunk_size;
-    }
-    return Error::Ok;
+  if (src == nullptr) {
+    ET_LOG(Error, "Ethos-U padded copy received null buffer");
+    return Error::InvalidState;
+  }
+  char* dest = tensor_out.mutable_data_ptr<char>();
+  if (dest == nullptr) {
+    ET_LOG(Error, "Ethos-U padded copy received null destination");
+    return Error::InvalidState;
+  }
+  const char* src_bytes = src;
+  for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) {
+    memcpy(dest, src_bytes, chunk_size);
+    src_bytes += vela_chunk_size;
+    dest += chunk_size;
   }
+  return Error::Ok;
+}
 
-  void calculate_dimensions(
-      const executorch::aten::Tensor tensor,
-      VelaIO* io,
-      int* tensor_count,
-      int* io_count) const {
-    for (int i = 0; i < tensor.dim(); i++) {
-      *tensor_count = *tensor_count * tensor.size(i);
-    }
+void calculate_dimensions(
+    const executorch::aten::Tensor tensor,
+    VelaIO* io,
+    int* tensor_count,
+    int* io_count) {
+  for (int i = 0; i < tensor.dim(); i++) {
+    *tensor_count = *tensor_count * tensor.size(i);
+  }
 
-    // The VelaIO type has a shape of fixed size 6
-    for (int i = 0; i < shapeDim; i++) {
-      *io_count = *io_count * io->shape[i];
-    }
+  // The VelaIO type has a shape of fixed size 6
+  for (int i = 0; i < shapeDim; i++) {
+    *io_count = *io_count * io->shape[i];
   }
-};
+}
 
 namespace {
 auto EthosUBackend_backend = EthosUBackend();
@@ -550,7 +401,6 @@ Backend EthosUBackend_id{"EthosUBackend", &EthosUBackend_backend};
 static executorch::runtime::Error EthosUBackend_registered =
     register_backend(EthosUBackend_id);
 
-#ifdef __ZEPHYR__
 /**
  * This function serves as a linker force-include mechanism to ensure the
  * EthosU backend module gets properly linked into the final executable,
@@ -570,7 +420,6 @@ extern "C" executorch::runtime::Error
 executorch_delegate_EthosUBackend_registered() {
   return EthosUBackend_registered;
 }
-#endif
 
 } // namespace
 
diff --git a/backends/arm/runtime/EthosUBackend_Cortex_A.cpp b/backends/arm/runtime/EthosUBackend_Cortex_A.cpp
new file mode 100644
index 00000000000..2a41fd6c037
--- /dev/null
+++ b/backends/arm/runtime/EthosUBackend_Cortex_A.cpp
@@ -0,0 +1,396 @@
+/*
+ * Copyright 2026 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/*
+ * Arm backend for Ethos-U Linux driver stack, this relies on the
+ * ethos-u-linux-driver-stack for hardware interaction.
+ */
+
+#include <array>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <mutex>
+#include <new>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <ethosu.hpp>
+#include <uapi/ethosu.h>
+
+#include <executorch/backends/arm/runtime/EthosUBackend_Internal.h>
+#include <executorch/runtime/core/error.h>
+
+using executorch::runtime::ArrayRef;
+using executorch::runtime::BackendExecutionContext;
+using executorch::runtime::CompileSpec;
+using executorch::runtime::Error;
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::Span;
+
+namespace executorch {
+namespace backends {
+namespace arm {
+
+constexpr int64_t kDefaultEthosUTimeoutNs = 60000000000LL; // 60 seconds in ns
+
+struct LinuxDriverOptions { // defaults; parse_linux_options() overrides them
+  std::string device_path = "/dev/ethosu0";
+  int64_t timeout_ns = kDefaultEthosUTimeoutNs;
+  bool enable_cycle_counter = true;
+  std::array<uint32_t, ETHOSU_PMU_EVENT_MAX> pmu_events{};
+};
+
+struct PlatformState {
+  LinuxDriverOptions options;
+};
+
+namespace {
+
+template <typename T>
+bool read_scalar_value(const CompileSpec& spec, T* out) {
+  if (spec.value.buffer != nullptr && spec.value.nbytes == sizeof(T)) {
+    std::memcpy(out, spec.value.buffer, sizeof(T));
+    return true;
+  }
+  return false; // payload missing, or its size is not sizeof(T)
+}
+
+std::string read_string_value(const CompileSpec& spec) {
+  // Decode the payload as text and drop any trailing NUL padding.
+  if (spec.value.buffer == nullptr || spec.value.nbytes == 0) {
+    return std::string();
+  }
+  const char* bytes = static_cast<const char*>(spec.value.buffer);
+  std::string text(bytes, spec.value.nbytes);
+  while (!text.empty() && text.back() == '\0') {
+    text.pop_back();
+  }
+  return text;
+}
+
+LinuxDriverOptions parse_linux_options(ArrayRef<CompileSpec> specs) {
+  LinuxDriverOptions options;
+  constexpr char kDeviceKey[] = "ethosu.device";
+  constexpr char kTimeoutKey[] = "ethosu.timeout_ns";
+  constexpr char kCycleCounterKey[] = "ethosu.enable_cycle_counter";
+  constexpr char kPmuPrefix[] = "ethosu.pmu_event";
+  // Unrecognised keys and malformed values are skipped, keeping the defaults.
+  for (const CompileSpec& spec : specs) {
+    if (spec.key == nullptr) {
+      continue;
+    }
+    // Device node path; an empty string keeps the default.
+    if (strcmp(spec.key, kDeviceKey) == 0) {
+      std::string device_path = read_string_value(spec);
+      if (!device_path.empty()) {
+        options.device_path = device_path;
+      }
+      continue;
+    }
+    // Timeout (ns) as an int64_t; values <= 0 keep the default.
+    if (strcmp(spec.key, kTimeoutKey) == 0) {
+      int64_t timeout = 0;
+      if (read_scalar_value(spec, &timeout) && timeout > 0) {
+        options.timeout_ns = timeout;
+      }
+      continue;
+    }
+    // Cycle counter switch as a single byte; any non-zero value enables it.
+    if (strcmp(spec.key, kCycleCounterKey) == 0) {
+      uint8_t enabled = 0;
+      if (read_scalar_value(spec, &enabled)) {
+        options.enable_cycle_counter = enabled != 0;
+      }
+      continue;
+    }
+    // kPmuPrefix + decimal index selects the pmu_events slot (uint32_t value).
+    // NOTE(review): characters after the index are not rejected - confirm.
+    if (strncmp(spec.key, kPmuPrefix, strlen(kPmuPrefix)) == 0) {
+      const char* index_str = spec.key + strlen(kPmuPrefix);
+      char* endptr = nullptr;
+      long idx = std::strtol(index_str, &endptr, 10);
+      if (endptr != index_str && idx >= 0 &&
+          idx < static_cast<long>(ETHOSU_PMU_EVENT_MAX)) {
+        uint32_t event = 0;
+        if (read_scalar_value(spec, &event)) {
+          options.pmu_events[static_cast<size_t>(idx)] = event;
+        }
+      }
+    }
+  }
+  return options;
+}
+
+class EthosULinuxDeviceCache { // keeps one EthosU::Device open at a time
+ public:
+  EthosU::Device& get(const std::string& device_path) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (!device_ || device_path != active_path_) { // first use or new path
+      device_ = std::make_unique<EthosU::Device>(device_path.c_str());
+      active_path_ = device_path;
+    }
+    return *device_; // NOTE(review): dangles once a different path is opened
+  }
+
+ private:
+  std::mutex mutex_; // held for the whole of get()
+  std::string active_path_;
+  std::unique_ptr<EthosU::Device> device_;
+};
+
+EthosULinuxDeviceCache& get_linux_device_cache() {
+  static EthosULinuxDeviceCache cache; // created on first call, shared after
+  return cache;
+}
+
+const char* inference_status_to_string(EthosU::InferenceStatus status) {
+  switch (status) {
+    case EthosU::InferenceStatus::OK:
+      return "OK";
+    case EthosU::InferenceStatus::ERROR:
+      return "ERROR";
+    case EthosU::InferenceStatus::RUNNING:
+      return "RUNNING";
+    case EthosU::InferenceStatus::REJECTED:
+      return "REJECTED";
+    case EthosU::InferenceStatus::ABORTED:
+      return "ABORTED";
+    case EthosU::InferenceStatus::ABORTING:
+      return "ABORTING";
+    case EthosU::InferenceStatus::PENDING:
+      return "PENDING";
+  }
+  return "UNKNOWN"; // value outside the InferenceStatus cases above
+}
+
+Error invoke_linux_driver(
+    const VelaHandles& handles,
+    const std::vector<const char*>& input_ptrs,
+    const std::vector<char*>& output_ptrs,
+    const std::vector<size_t>& input_copy_sizes,
+    const std::vector<size_t>& output_copy_sizes,
+    const LinuxDriverOptions& options) {
+  if (handles.outputs == nullptr) {
+    ET_LOG(Error, "Ethos-U backend missing output metadata");
+    return Error::InvalidProgram;
+  }
+
+  try {
+    EthosU::Device& device = get_linux_device_cache().get(options.device_path);
+    auto network = std::make_shared<EthosU::Network>(
+        device,
+        reinterpret_cast<const unsigned char*>(handles.cmd_data),
+        handles.cmd_data_size);
+
+    std::shared_ptr<EthosU::Buffer> constant_buffer =
+        std::make_shared<EthosU::Buffer>();
+    if (handles.weight_data_size > 0) {
+      auto constant_buffers = device.createBuffers({handles.weight_data_size});
+      constant_buffer = constant_buffers.front();
+      constant_buffer->write(
+          const_cast<char*>(handles.weight_data), handles.weight_data_size);
+    }
+
+    std::shared_ptr<EthosU::Buffer> intermediate_buffer =
+        std::make_shared<EthosU::Buffer>();
+    if (handles.scratch_data_size > 0) {
+      auto scratch_buffers = device.createBuffers({handles.scratch_data_size});
+      intermediate_buffer = scratch_buffers.front();
+    }
+
+    std::vector<std::shared_ptr<EthosU::Buffer>> ifm_buffers;
+    if (handles.inputs != nullptr && handles.inputs->count > 0) {
+      if (input_copy_sizes.size() !=
+          static_cast<size_t>(handles.inputs->count)) {
+        ET_LOG(
+            Error,
+            "Mismatch between input metadata (%d) and copy plan (%zu)",
+            handles.inputs->count,
+            input_copy_sizes.size());
+        return Error::InvalidProgram;
+      }
+      if (input_ptrs.size() != input_copy_sizes.size()) {
+        ET_LOG(
+            Error,
+            "Mismatch between input metadata and runtime pointers (%zu vs %zu)",
+            input_ptrs.size(),
+            input_copy_sizes.size());
+        return Error::InvalidState;
+      }
+      ifm_buffers = device.createBuffers(input_copy_sizes);
+      for (int i = 0; i < handles.inputs->count; ++i) {
+        const size_t copy_size = input_copy_sizes[i];
+        if (copy_size == 0) {
+          continue;
+        }
+        const char* src = input_ptrs[i];
+        if (src == nullptr) {
+          ET_LOG(Error, "Missing input buffer for index %d", i);
+          return Error::InvalidState;
+        }
+        ifm_buffers[i]->write(const_cast<char*>(src), copy_size);
+      }
+    }
+
+    if (output_copy_sizes.size() !=
+        static_cast<size_t>(handles.outputs->count)) {
+      ET_LOG(
+          Error,
+          "Mismatch between output metadata (%d) and copy plan (%zu)",
+          handles.outputs->count,
+          output_copy_sizes.size());
+      return Error::InvalidProgram;
+    }
+    if (output_ptrs.size() != output_copy_sizes.size()) {
+      ET_LOG(
+          Error,
+          "Mismatch between output metadata and runtime buffers (%zu vs %zu)",
+          output_ptrs.size(),
+          output_copy_sizes.size());
+      return Error::InvalidState;
+    }
+    auto ofm_buffers = device.createBuffers(output_copy_sizes);
+
+    auto inference = std::make_unique<EthosU::Inference>(
+        network,
+        ifm_buffers.begin(),
+        ifm_buffers.end(),
+        ofm_buffers.begin(),
+        ofm_buffers.end(),
+        intermediate_buffer,
+        constant_buffer,
+        options.pmu_events,
+        options.enable_cycle_counter);
+
+    if (inference->wait(options.timeout_ns)) {
+      ET_LOG(
+          Error,
+          "Ethos-U inference timed out after %lld ns",
+          static_cast<long long>(options.timeout_ns));
+      return Error::InvalidState;
+    }
+
+    auto status = inference->status();
+    if (status != EthosU::InferenceStatus::OK) {
+      ET_LOG(
+          Error,
+          "Ethos-U inference failed with status %s",
+          inference_status_to_string(status));
+      return Error::InvalidState;
+    }
+
+    if (options.enable_cycle_counter) {
+      try {
+        uint64_t cycles = inference->getCycleCounter();
+        ET_LOG(
+            Info,
+            "Ethos-U Linux delegate cycle counter: %llu",
+            static_cast<unsigned long long>(cycles));
+      } catch (const std::exception& e) {
+        ET_LOG(Debug, "Failed to read Ethos-U cycle counter: %s", e.what());
+      }
+    }
+
+    for (int i = 0; i < handles.outputs->count; ++i) {
+      const size_t copy_size = output_copy_sizes[i];
+      if (copy_size == 0) {
+        continue;
+      }
+      char* dst = output_ptrs[i];
+      if (dst == nullptr) {
+        ET_LOG(Error, "Missing output buffer for index %d", i);
+        return Error::InvalidState;
+      }
+      ofm_buffers[i]->read(dst, copy_size);
+    }
+  } catch (const std::exception& e) {
+    ET_LOG(Error, "Ethos-U Linux driver invocation failed: %s", e.what());
+    return Error::InvalidState;
+  }
+
+  return Error::Ok;
+}
+} // namespace
+
+PlatformState* platform_init(
+    ArrayRef<CompileSpec> specs,
+    MemoryAllocator* allocator) {
+  // Without an allocator, or if allocation fails, the state has no storage.
+  PlatformState* raw = allocator != nullptr
+      ? allocator->allocateInstance<PlatformState>()
+      : nullptr;
+  if (raw == nullptr) {
+    return nullptr;
+  }
+  PlatformState* const state = new (raw) PlatformState();
+  state->options = parse_linux_options(specs);
+  return state;
+}
+
+// Linux platform_execute(): collects host pointers and byte counts for the
+// args tensors (inputs first, then outputs) and calls invoke_linux_driver().
+Error platform_execute(
+    BackendExecutionContext& /*context*/,
+    const ExecutionHandle* execution_handle,
+    const VelaHandles& handles,
+    int input_count,
+    int output_count,
+    Span<executorch::runtime::EValue*> args,
+    char* /*ethosu_scratch*/) {
+  if (execution_handle == nullptr ||
+      execution_handle->platform_state == nullptr) {
+    ET_LOG(Error, "Ethos-U Linux backend missing platform state");
+    return Error::InvalidState;
+  }
+  const PlatformState* state = execution_handle->platform_state;
+  if (input_count < 0 || output_count < 0 ||
+      args.size() < static_cast<size_t>(input_count) +
+              static_cast<size_t>(output_count)) {
+    ET_LOG(Error, "Ethos-U Linux backend received too few arguments");
+    return Error::InvalidArgument;
+  }
+  // invoke_linux_driver() dereferences handles.outputs unconditionally.
+  if (handles.outputs == nullptr) {
+    ET_LOG(Error, "Ethos-U Linux backend missing output metadata");
+    return Error::InvalidProgram;
+  }
+
+  std::vector<size_t> input_copy_sizes(input_count);
+  std::vector<const char*> linux_input_ptrs(input_count);
+  for (int i = 0; i < input_count; ++i) {
+    auto tensor_in = args[i]->toTensor();
+    linux_input_ptrs[i] = tensor_in.mutable_data_ptr<char>();
+    input_copy_sizes[i] = tensor_in.nbytes();
+  }
+
+  std::vector<size_t> output_io_bytes(output_count);
+  std::vector<char*> linux_output_ptrs(output_count);
+  for (int i = 0; i < output_count; ++i) {
+    int tensor_count = 1, io_count = 1;
+    auto tensor_out = args[input_count + i]->toTensor();
+    calculate_dimensions(
+        tensor_out, &handles.outputs->io[i], &tensor_count, &io_count);
+    output_io_bytes[i] = static_cast<size_t>(io_count) *
+        static_cast<size_t>(handles.outputs->io[i].elem_size);
+    linux_output_ptrs[i] = tensor_out.mutable_data_ptr<char>();
+  }
+
+  return invoke_linux_driver(
+      handles,
+      linux_input_ptrs,
+      linux_output_ptrs,
+      input_copy_sizes,
+      output_io_bytes,
+      state->options);
+}
+
+} // namespace arm
+} // namespace backends
+} // namespace executorch
diff --git a/backends/arm/runtime/EthosUBackend_Cortex_M.cpp b/backends/arm/runtime/EthosUBackend_Cortex_M.cpp
new file mode 100644
index 00000000000..7e6e9f5efaf
--- /dev/null
+++ b/backends/arm/runtime/EthosUBackend_Cortex_M.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2026 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/*
+ * Arm backend for the Ethos-U bare-metal driver stack. It relies on the
+ * ethos-u-core-driver for hardware interaction.
+ */
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+
+#include <ethosu_driver.h>
+
+#include <executorch/backends/arm/runtime/EthosUBackend_Internal.h>
+#include <executorch/runtime/core/error.h>
+
+using executorch::runtime::BackendExecutionContext;
+using executorch::runtime::Error;
+using executorch::runtime::Span;
+
+namespace executorch {
+namespace backends {
+namespace arm {
+
+struct PlatformState {}; // Empty on this platform.
+
+PlatformState* platform_init(
+    executorch::runtime::ArrayRef<executorch::runtime::CompileSpec> /*specs*/,
+    executorch::runtime::MemoryAllocator* /*allocator*/) {
+  return nullptr; // Nothing is allocated; both arguments are ignored.
+}
+
+Error platform_execute(
+    BackendExecutionContext& /*context*/,
+    const ExecutionHandle* /*execution_handle*/,
+    const VelaHandles& handles,
+    int input_count,
+    int output_count,
+    Span<executorch::runtime::EValue*> args,
+    char* ethosu_scratch) {
+  // Reserve a driver handle; the unique_ptr releases it on every return path.
+  auto driver =
+      std::unique_ptr<ethosu_driver, decltype(&ethosu_release_driver)>(
+          ethosu_reserve_driver(), ethosu_release_driver);
+  if (driver == nullptr) {
+    ET_LOG(Error, "ethosu_reserve_driver failed");
+    return Error::InvalidState;
+  }
+
+  // Base addresses handed to the Ethos-U driver, in order: constant weight
+  // data, scratch (which also holds the inputs and outputs), fast scratch.
+  // NOTE(review): scratch is presumably filled by the caller - confirm.
+  uint64_t bases[ETHOSU_NUM_BASE_ADDRS] = {
+      static_cast<uint64_t>(reinterpret_cast<uintptr_t>((handles.weight_data))),
+      static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ethosu_scratch)),
+      static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ethosu_fast_scratch))};
+  size_t bases_size[ETHOSU_NUM_BASE_ADDRS] = {
+      handles.weight_data_size,
+      handles.scratch_data_size,
+      ethosu_fast_scratch_size};
+  int result = ethosu_invoke_v3(
+      driver.get(),
+      static_cast<const void*>(handles.cmd_data),
+      handles.cmd_data_size,
+      bases,
+      bases_size,
+      ETHOSU_NUM_BASE_ADDRS, /* fixed array of pointers to binary interface*/
+      nullptr);
+
+  if (result != 0) {
+    ET_LOG(Error, "Ethos-U invocation failed error (%d)", result);
+    return Error::InvalidProgram;
+  }
+
+  size_t tensor_bytes_total = 0;
+  size_t io_bytes_total = 0;
+  // Copy each output from the scratch area into its EValue tensor.
+  for (int i = 0; i < output_count; i++) {
+    int tensor_count = 1, io_count = 1;
+    const char* output_addr = ethosu_scratch + handles.outputs->io[i].offset;
+    // Destination tensor for this output; outputs sit in args at the
+    // indices immediately after the inputs.
+    auto tensor_out = args[input_count + i]->toTensor();
+
+    calculate_dimensions(
+        tensor_out, &handles.outputs->io[i], &tensor_count, &io_count);
+
+    size_t tensor_bytes = tensor_out.nbytes();
+    size_t io_bytes = static_cast<size_t>(io_count) *
+        static_cast<size_t>(handles.outputs->io[i].elem_size);
+
+    if (tensor_bytes != io_bytes) {
+      Error status = copy_with_layout_adjustment(
+          handles.outputs->io[i], i, output_addr, tensor_out, tensor_bytes);
+      if (status != Error::Ok) {
+        return status;
+      }
+      io_bytes_total += tensor_bytes;
+    } else {
+      memcpy(
+          tensor_out.mutable_data_ptr<char>(),
+          static_cast<const char*>(output_addr),
+          tensor_bytes);
+      io_bytes_total += io_bytes;
+    }
+
+    // Output order may change, so only the summed byte counts are compared.
+    // NOTE(review): both branches add tensor_bytes, so the totals always match.
+    tensor_bytes_total += tensor_bytes;
+  }
+  if (tensor_bytes_total != io_bytes_total) {
+    ET_LOG(Error, "Total output tensor sizes do not match");
+    ET_LOG(
+        Error,
+        "Program expects %zu bytes but got %zu",
+        io_bytes_total,
+        tensor_bytes_total);
+    return Error::InvalidProgram;
+  }
+  return Error::Ok;
+}
+
+} // namespace arm
+} // namespace backends
+} // namespace executorch
diff --git a/backends/arm/runtime/EthosUBackend_Internal.h b/backends/arm/runtime/EthosUBackend_Internal.h
new file mode 100644
index 00000000000..d1e543f07df
--- /dev/null
+++ b/backends/arm/runtime/EthosUBackend_Internal.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2026 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+// Workaround for runtime/core/portable_type/c10/c10/util/Float16-math.h
+#if defined(__GNUC__) && defined(__ZEPHYR__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+#endif
+
+#include <cstddef>
+#include <cstdint>
+
+#include <executorch/backends/arm/runtime/VelaBinStream.h>
+#include <executorch/runtime/backend/interface.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/evalue.h>
+
+#if defined(__GNUC__) && defined(__ZEPHYR__)
+#pragma GCC diagnostic pop
+#endif
+
+#if defined(ET_EVENT_TRACER_ENABLED)
+#include <executorch/runtime/core/event_tracer.h>
+#include <executorch/runtime/core/event_tracer_hooks.h>
+using executorch::runtime::EventTracer;
+using executorch::runtime::EventTracerEntry;
+
+// Scope guard: start_profiling() on construction, end_profiling() on exit.
+class EventTraceScope {
+ public:
+  EventTraceScope(EventTracer* event_tracer_, const char* name)
+      : event_tracer(event_tracer_),
+        event_tracer_entry_scope(event_tracer_->start_profiling(name)) {}
+  ~EventTraceScope() {
+    event_tracer->end_profiling(event_tracer_entry_scope);
+  }
+
+ private:
+  EventTracer* event_tracer;
+  EventTracerEntry event_tracer_entry_scope;
+};
+#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME) \
+  EventTraceScope event_tracer_scope = EventTraceScope(EVENTTRACER, NAME)
+#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME) \
+  SCOPE = EVENTTRACER->start_profiling(NAME)
+#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE) \
+  EVENTTRACER->end_profiling(SCOPE)
+#else
+#define EXECUTORCH_PROF_SCOPE(EVENTTRACER, NAME)
+#define EXECUTORCH_PROF_START(EVENTTRACER, SCOPE, NAME)
+#define EXECUTORCH_PROF_END(EVENTTRACER, SCOPE)
+#endif
+
+#define ETHOSU_NUM_BASE_ADDRS 3
+
+namespace executorch {
+namespace backends {
+namespace arm {
+
+struct PlatformState;
+
+struct ExecutionHandle {
+  executorch::runtime::FreeableBuffer* processed; // not owned - TODO confirm
+  PlatformState* platform_state; // per-platform type; checked for null on Linux
+};
+
+extern "C" {
+void EthosUBackend_execute_begin();
+void EthosUBackend_execute_end();
+extern unsigned char* ethosu_fast_scratch;
+extern size_t ethosu_fast_scratch_size;
+}
+
+PlatformState* platform_init(
+    executorch::runtime::ArrayRef<executorch::runtime::CompileSpec> specs,
+    executorch::runtime::MemoryAllocator* allocator);
+executorch::runtime::Error platform_execute(
+    executorch::runtime::BackendExecutionContext& context,
+    const ExecutionHandle* execution_handle,
+    const VelaHandles& handles,
+    int input_count,
+    int output_count,
+    executorch::runtime::Span<executorch::runtime::EValue*> args,
+    char* ethosu_scratch);
+
+executorch::runtime::Error copy_with_layout_adjustment(
+    const VelaIO& output_io,
+    int output_index,
+    const char* src,
+    executorch::aten::Tensor& tensor_out,
+    size_t tensor_bytes);
+
+void calculate_dimensions(
+    const executorch::aten::Tensor tensor,
+    VelaIO* io,
+    int* tensor_count,
+    int* io_count);
+
+} // namespace arm
+} // namespace backends
+} // namespace executorch
diff --git a/backends/arm/runtime/VelaBinStream.cpp b/backends/arm/runtime/VelaBinStream.cpp
index c8d568499c9..70c5b0c7666 100644
--- a/backends/arm/runtime/VelaBinStream.cpp
+++ b/backends/arm/runtime/VelaBinStream.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023, 2025 Arm Limited and/or its affiliates.
+ * Copyright 2023, 2025-2026 Arm Limited and/or its affiliates.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -65,8 +65,10 @@ bool vela_bin_read(const char* data, VelaHandles* handles, int size) {
         return false;
     } else if (!strncmp(b->name, "cmd_data", strlen("cmd_data"))) {
       // This driver magic header confirms a valid command stream in binary
-      if (strncmp(b->data, "COP1", strlen("COP1")))
+      if (strncmp(b->data, "COP1", strlen("COP1")) &&
+          strncmp(b->data, "COP2", strlen("COP2"))) {
         return false;
+      }
       handles->cmd_data = b->data;
       handles->cmd_data_size = b->size;
     } else if (!strncmp(b->name, "weight_data", strlen("weight_data"))) {
diff --git a/examples/arm/ethos-u-setup/aarch64-linux-musl-toolchain.cmake b/examples/arm/ethos-u-setup/aarch64-linux-musl-toolchain.cmake
new file mode 100644
index 00000000000..e4b8af62067
--- /dev/null
+++ b/examples/arm/ethos-u-setup/aarch64-linux-musl-toolchain.cmake
@@ -0,0 +1,79 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# The version floor is deliberately low so this toolchain file stays usable
+# even when the enclosing project demands a newer CMake.
+cmake_minimum_required(VERSION 3.20)
+
+# Cross-compiler root: the cache/-D value wins, the environment is the fallback.
+set(MUSL_TOOLCHAIN_ROOT
+    ""
+    CACHE PATH "Root of the aarch64-linux-musl toolchain"
+)
+if("${MUSL_TOOLCHAIN_ROOT}" STREQUAL "" AND DEFINED ENV{MUSL_TOOLCHAIN_ROOT})
+  set(MUSL_TOOLCHAIN_ROOT "$ENV{MUSL_TOOLCHAIN_ROOT}")
+endif()
+if("${MUSL_TOOLCHAIN_ROOT}" STREQUAL "")
+  message(
+    FATAL_ERROR
+      "MUSL_TOOLCHAIN_ROOT is required (e.g. -DMUSL_TOOLCHAIN_ROOT=/path/to/aarch64-linux-musl-cross or export MUSL_TOOLCHAIN_ROOT=...)"
+  )
+endif()
+
+# Propagate the toolchain root into the projects generated by try_compile().
+set(CMAKE_TRY_COMPILE_PLATFORM_VARIABLES MUSL_TOOLCHAIN_ROOT)
+set(_MUSL_SYSROOT "${MUSL_TOOLCHAIN_ROOT}/aarch64-linux-musl")
+set(_MUSL_BIN_DIR "${MUSL_TOOLCHAIN_ROOT}/bin")
+
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
+
+set(CMAKE_SYSROOT
+    "${_MUSL_SYSROOT}"
+    CACHE PATH "Musl target sysroot"
+)
+
+set(CMAKE_C_COMPILER
+    "${_MUSL_BIN_DIR}/aarch64-linux-musl-gcc"
+    CACHE FILEPATH "Musl cross C compiler"
+)
+set(CMAKE_CXX_COMPILER
+    "${_MUSL_BIN_DIR}/aarch64-linux-musl-g++"
+    CACHE FILEPATH "Musl cross C++ compiler"
+)
+set(CMAKE_AR
+    "${_MUSL_BIN_DIR}/aarch64-linux-musl-ar"
+    CACHE FILEPATH "Musl archiver"
+)
+set(CMAKE_RANLIB
+    "${_MUSL_BIN_DIR}/aarch64-linux-musl-ranlib"
+    CACHE FILEPATH "Musl ranlib"
+)
+set(CMAKE_STRIP
+    "${_MUSL_BIN_DIR}/aarch64-linux-musl-strip"
+    CACHE FILEPATH "Musl strip"
+)
+
+set(CMAKE_FIND_ROOT_PATH "${CMAKE_SYSROOT}")
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) # find_program(): host paths only
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) # find_library(): sysroot only
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) # find_path()/find_file(): sysroot
+set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) # find_package(): sysroot only
+
+# Default pkg-config to the target sysroot unless the caller already chose one,
+# and prepend the sysroot .pc directories only once across repeated loads.
+if(NOT DEFINED ENV{PKG_CONFIG_SYSROOT_DIR})
+  set(ENV{PKG_CONFIG_SYSROOT_DIR} "${CMAKE_SYSROOT}")
+endif()
+
+set(_MUSL_PC_DIRS
+    "${CMAKE_SYSROOT}/usr/lib/pkgconfig:${CMAKE_SYSROOT}/usr/share/pkgconfig"
+)
+string(FIND "$ENV{PKG_CONFIG_PATH}" "${_MUSL_PC_DIRS}" _MUSL_PC_POS)
+if(NOT DEFINED ENV{PKG_CONFIG_PATH})
+  set(ENV{PKG_CONFIG_PATH} "${_MUSL_PC_DIRS}")
+elseif(NOT _MUSL_PC_POS EQUAL 0)
+  set(ENV{PKG_CONFIG_PATH} "${_MUSL_PC_DIRS}:$ENV{PKG_CONFIG_PATH}")
+endif()
diff --git a/examples/arm/executor_runner/ethosu_link_helper.cpp b/examples/arm/executor_runner/ethosu_link_helper.cpp
new file mode 100644
index 00000000000..3130dfbd78b
--- /dev/null
+++ b/examples/arm/executor_runner/ethosu_link_helper.cpp
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2026 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Helper to force-link the Ethos-U backend when building the portable runner.
+
+#if defined(EXECUTORCH_BUILD_ARM_ETHOSU_LINUX)
+#include <executorch/runtime/core/error.h>
+
+extern "C" ::executorch::runtime::Error
+executorch_delegate_EthosUBackend_registered();
+
+namespace {
+// Referencing the registration symbol from a static initializer forces the
+// linker to keep the Ethos-U backend object file.
+struct EthosULinkHook {
+  EthosULinkHook() {
+    static_cast<void>(executorch_delegate_EthosUBackend_registered());
+  }
+};
+EthosULinkHook g_link_hook;
+} // namespace
+#endif // EXECUTORCH_BUILD_ARM_ETHOSU_LINUX
diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake
index 9a16f1ae4f4..5a7d777580c 100644
--- a/tools/cmake/preset/default.cmake
+++ b/tools/cmake/preset/default.cmake
@@ -1,6 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
-# Copyright 2025 Arm Limited and/or its affiliates.
+# Copyright 2025-2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -63,6 +63,10 @@ define_overridable_option(
   EXECUTORCH_BUILD_ARM_BAREMETAL
   "Build the Arm Baremetal flow for Cortex-M and Ethos-U" BOOL OFF
 )
+define_overridable_option(
+  EXECUTORCH_BUILD_ARM_ETHOSU_LINUX
+  "Build the Arm Ethos-U backend for the Linux driver stack" BOOL OFF
+)
 define_overridable_option(
   EXECUTORCH_BUILD_KERNELS_LLM "Build the custom kernels" BOOL OFF
 )
@@ -232,6 +236,11 @@ check_conflicting_options_on(
   EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES
 )
 
+check_conflicting_options_on(
+  IF_ON EXECUTORCH_BUILD_ARM_ETHOSU_LINUX CONFLICTS_WITH
+  EXECUTORCH_BUILD_ARM_BAREMETAL
+)
+
 # TODO(jathu): move this to platform specific presets when created
 set(_default_executorch_build_executor_runner ON)
 if(APPLE AND "${SDK_NAME}" STREQUAL "iphoneos")