Skip to content

Commit af22f84

Browse files
authored
Fix CUDA cross-compilation and Windows support (#17365)
- Enable CUDA language in CMake for proper .cu file compilation
- Fix compiler flag conditions to use COMPILE_LANGUAGE for multi-language targets
- Use WIN32 instead of MSVC for MinGW cross-compilation dllexport symbols
- Add MSVC-specific object library linking to preserve AOTI symbols
- Implement proper Windows error formatting in platform.cpp
- Add PAL initialization in CUDA memory shims for Windows DLL isolation
- Implement missing AOTI tensor shim APIs (create_tensor_from_blob, clone)
- Add logging helper function for AOTI-generated code
- Add TORCH_CUDA_ARCH_LIST detection and warning for non-cross-compile scenarios
- Add explicit 'configuration' field to CMake build presets for consistency
- Fix ET_CHECK_TK_OK_OR_RETURN_ERROR macro to use do-while instead of statement expression

This enables CUDA backend compilation on Windows with proper symbol visibility and device architecture detection during cross-compilation scenarios.
1 parent 785adf0 commit af22f84

File tree

12 files changed

+634
-70
lines changed

12 files changed

+634
-70
lines changed

.github/workflows/cuda-windows.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ jobs:
3232
model:
3333
- repo: "mistralai"
3434
name: "Voxtral-Mini-3B-2507"
35+
- repo: "nvidia"
36+
name: "parakeet-tdt"
3537
quant:
3638
- "non-quantized"
3739
- "quantized-int4-weight-only"

CMakePresets.json

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -197,9 +197,9 @@
197197
"EXECUTORCH_BUILD_CUDA": "ON"
198198
},
199199
"condition": {
200-
"lhs": "${hostSystemName}",
201-
"type": "equals",
202-
"rhs": "Linux"
200+
"type": "inList",
201+
"string": "${hostSystemName}",
202+
"list": ["Linux", "Windows"]
203203
}
204204
},
205205
{
@@ -289,6 +289,7 @@
289289
"name": "llm-release-install",
290290
"displayName": "Build and install LLM extension release artifacts",
291291
"configurePreset": "llm-release",
292+
"configuration": "Release",
292293
"targets": [
293294
"install"
294295
],
@@ -298,6 +299,7 @@
298299
"name": "llm-release-cuda-install",
299300
"displayName": "Build and install LLM extension release artifacts (CUDA)",
300301
"configurePreset": "llm-release-cuda",
302+
"configuration": "Release",
301303
"targets": [
302304
"install"
303305
],
@@ -307,6 +309,7 @@
307309
"name": "llm-release-metal-install",
308310
"displayName": "Build and install LLM extension release artifacts (Metal)",
309311
"configurePreset": "llm-release-metal",
312+
"configuration": "Release",
310313
"targets": [
311314
"install"
312315
],

backends/aoti/CMakeLists.txt

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,10 @@ target_include_directories(
3838
)
3939
target_compile_options(
4040
aoti_common
41-
PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
42-
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
41+
PUBLIC
42+
$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CXX_COMPILER_ID:MSVC>>:/EHsc /GR>
43+
$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<NOT:$<CXX_COMPILER_ID:MSVC>>>:-fexceptions
44+
-frtti -fPIC>
4345
)
4446
target_compile_definitions(
4547
aoti_common PRIVATE $<$<PLATFORM_ID:Windows>:EXPORT_AOTI_FUNCTIONS>
@@ -107,15 +109,41 @@ endif()
107109
add_library(aoti_common_shims_slim STATIC ${_aoti_common_shims_slim_sources})
108110
target_compile_options(
109111
aoti_common_shims_slim
110-
PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
111-
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
112+
PUBLIC
113+
$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CXX_COMPILER_ID:MSVC>>:/EHsc /GR>
114+
$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<NOT:$<CXX_COMPILER_ID:MSVC>>>:-fexceptions
115+
-frtti -fPIC>
112116
)
113117
target_compile_definitions(
114118
aoti_common_shims_slim PUBLIC $<$<PLATFORM_ID:Windows>:EXPORT_AOTI_FUNCTIONS>
115119
)
116120

117121
target_link_libraries(aoti_common_shims_slim PUBLIC slimtensor ${CMAKE_DL_LIBS})
118122

123+
if(MSVC)
124+
# MSVC drops unreferenced symbols from static libs; export symbols by
125+
# including the objects directly in the CUDA shims DLL.
126+
add_library(
127+
aoti_common_shims_slim_obj OBJECT ${_aoti_common_shims_slim_sources}
128+
)
129+
target_compile_options(
130+
aoti_common_shims_slim_obj
131+
PUBLIC
132+
$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CXX_COMPILER_ID:MSVC>>:/EHsc
133+
/GR>
134+
$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<NOT:$<CXX_COMPILER_ID:MSVC>>>:-fexceptions
135+
-frtti
136+
-fPIC>
137+
)
138+
target_compile_definitions(
139+
aoti_common_shims_slim_obj
140+
PUBLIC $<$<PLATFORM_ID:Windows>:EXPORT_AOTI_FUNCTIONS>
141+
)
142+
target_link_libraries(
143+
aoti_common_shims_slim_obj PUBLIC slimtensor ${CMAKE_DL_LIBS}
144+
)
145+
endif()
146+
119147
install(
120148
TARGETS aoti_common_shims_slim
121149
EXPORT ExecuTorchTargets

backends/cuda/CMakeLists.txt

Lines changed: 97 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,47 @@ set(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
3333

3434
find_package(CUDAToolkit REQUIRED)
3535

36+
# Try to enable CUDA language when a working CUDA compiler toolchain is
37+
# available. Some CI environments (notably Windows packaging jobs) provide
38+
# CUDAToolkit headers/libs but cannot complete CUDA compiler identification. In
39+
# those cases, keep configuration working and skip CUDA-only sources below.
40+
if(NOT CMAKE_CUDA_COMPILER)
41+
include(CheckLanguage)
42+
check_language(CUDA)
43+
endif()
44+
45+
if(CMAKE_CUDA_COMPILER)
46+
enable_language(CUDA)
47+
endif()
48+
49+
# Centralize Windows/MSVC checks used throughout this file.
50+
set(_cuda_is_msvc_toolchain OFF)
51+
if(MSVC)
52+
set(_cuda_is_msvc_toolchain ON)
53+
endif()
54+
55+
set(_cuda_is_windows_msvc OFF)
56+
if(WIN32 AND _cuda_is_msvc_toolchain)
57+
set(_cuda_is_windows_msvc ON)
58+
endif()
59+
60+
# Common C++ compile options for CUDA backend targets.
61+
if(_cuda_is_msvc_toolchain)
62+
set(_cuda_cxx_compile_options /EHsc /GR)
63+
else()
64+
set(_cuda_cxx_compile_options -fexceptions -frtti -fPIC)
65+
endif()
66+
67+
# Platform-specific linker option for exporting symbols from shared libs.
68+
set(_cuda_export_dynamic_option "")
69+
if(NOT _cuda_is_msvc_toolchain)
70+
if(APPLE)
71+
set(_cuda_export_dynamic_option -Wl,-export_dynamic)
72+
else()
73+
set(_cuda_export_dynamic_option -Wl,--export-dynamic)
74+
endif()
75+
endif()
76+
3677
# Use ExecuTorch's standard way to find PyTorch libraries for AOTI
3778
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
3879
find_package_torch()
@@ -47,17 +88,11 @@ target_include_directories(
4788
)
4889
target_compile_options(
4990
cuda_tensor_maker
50-
PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
51-
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
91+
PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${_cuda_cxx_compile_options}>"
5292
)
5393
# Ensure symbols are exported properly
54-
if(APPLE)
55-
target_link_options(cuda_tensor_maker PUBLIC -Wl,-export_dynamic)
56-
else()
57-
target_link_options(
58-
cuda_tensor_maker PUBLIC
59-
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
60-
)
94+
if(_cuda_export_dynamic_option)
95+
target_link_options(cuda_tensor_maker PUBLIC ${_cuda_export_dynamic_option})
6196
endif()
6297

6398
# Link against ExecuTorch core libraries
@@ -84,8 +119,7 @@ target_include_directories(
84119

85120
target_compile_options(
86121
cuda_platform
87-
PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
88-
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
122+
PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${_cuda_cxx_compile_options}>"
89123
)
90124

91125
# Link against ExecuTorch core libraries
@@ -98,24 +132,30 @@ install(
98132
)
99133

100134
# CUDA-specific AOTI shim symbols (dynamically linked)
101-
set(_aoti_cuda_shim_sources
102-
runtime/shims/memory.cpp runtime/shims/cuda_guard.cpp
103-
runtime/shims/int4mm.cu
135+
set(_aoti_cuda_shim_sources runtime/shims/memory.cpp
136+
runtime/shims/cuda_guard.cpp
104137
)
105138

139+
# Only build int4mm shim when CUDA language/toolchain is available.
140+
if(CMAKE_CUDA_COMPILER)
141+
list(APPEND _aoti_cuda_shim_sources runtime/shims/int4mm.cu)
142+
endif()
143+
106144
add_library(aoti_cuda_shims SHARED ${_aoti_cuda_shim_sources})
107145

108146
# Define CUDA_AVAILABLE to use SlimTensor on GPU in common_shims_slim.h
109147
target_compile_definitions(aoti_cuda_shims PRIVATE CUDA_AVAILABLE=1)
110148

111-
# Define export macros for shared library
112-
if(MSVC)
149+
# Define export macros for shared library. Use WIN32 (not just MSVC) so MinGW
150+
# cross-compiles also emit dllexport symbols for AOTI shims.
151+
if(WIN32)
113152
target_compile_definitions(aoti_cuda_shims PRIVATE EXPORT_AOTI_FUNCTIONS)
114-
115-
# Ensure proper DLL import/export library naming on Windows
116-
set_target_properties(
117-
aoti_cuda_shims PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS OFF
118-
)
153+
if(_cuda_is_windows_msvc)
154+
# Ensure proper DLL import/export library naming on Windows
155+
set_target_properties(
156+
aoti_cuda_shims PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS OFF
157+
)
158+
endif()
119159
endif()
120160

121161
target_include_directories(
@@ -126,30 +166,35 @@ target_include_directories(
126166

127167
target_compile_options(
128168
aoti_cuda_shims
129-
PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
130-
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
169+
PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${_cuda_cxx_compile_options}>"
131170
)
132171

133172
# Ensure symbols are exported properly
134-
target_link_options(
135-
aoti_cuda_shims PUBLIC $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
136-
)
173+
if(_cuda_export_dynamic_option)
174+
target_link_options(aoti_cuda_shims PUBLIC ${_cuda_export_dynamic_option})
175+
endif()
137176

138177
# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and
139-
# platform utilities. Use --whole-archive for aoti_common_shims_slim to ensure
140-
# all symbols are exported from this shared library.
141-
target_link_libraries(
142-
aoti_cuda_shims
143-
PRIVATE cuda_platform
144-
PUBLIC $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--whole-archive>
145-
aoti_common_shims_slim
146-
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--no-whole-archive>
147-
cuda_tensor_maker
148-
CUDA::cudart
149-
${CMAKE_DL_LIBS}
150-
)
178+
# platform utilities. On non-MSVC, use --whole-archive for
179+
# aoti_common_shims_slim to force shim symbol retention.
180+
if(_cuda_is_msvc_toolchain)
181+
target_link_libraries(
182+
aoti_cuda_shims PRIVATE cuda_platform cuda_tensor_maker CUDA::cudart
183+
${CMAKE_DL_LIBS}
184+
)
185+
# Link object library directly so symbols are pulled exactly once while
186+
# avoiding duplicate static/object inclusion and interface leakage.
187+
target_link_libraries(aoti_cuda_shims PRIVATE aoti_common_shims_slim_obj)
188+
else()
189+
target_link_libraries(
190+
aoti_cuda_shims
191+
PRIVATE cuda_platform
192+
PUBLIC -Wl,--whole-archive aoti_common_shims_slim -Wl,--no-whole-archive
193+
cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS}
194+
)
195+
endif()
151196

152-
if(NOT MSVC)
197+
if(NOT _cuda_is_msvc_toolchain)
153198
executorch_target_link_options_shared_lib(aoti_cuda_shims)
154199
endif()
155200

@@ -172,14 +217,12 @@ target_include_directories(
172217
)
173218
target_compile_options(
174219
aoti_cuda_backend
175-
PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
176-
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
220+
PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${_cuda_cxx_compile_options}>"
177221
)
178222
# Ensure symbols are exported properly
179-
target_link_options(
180-
aoti_cuda_backend PUBLIC
181-
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
182-
)
223+
if(_cuda_export_dynamic_option)
224+
target_link_options(aoti_cuda_backend PUBLIC ${_cuda_export_dynamic_option})
225+
endif()
183226

184227
# Link against shims library and other dependencies. On Windows (MSVC), use
185228
# PRIVATE linkage for aoti_cuda_shims since the DLL is copied to the executable
@@ -190,8 +233,15 @@ target_link_libraries(
190233
CUDA::cudart ${CMAKE_DL_LIBS}
191234
)
192235

193-
if(MSVC)
194-
target_link_libraries(aoti_cuda_backend PRIVATE aoti_cuda_shims)
236+
if(_cuda_is_msvc_toolchain)
237+
# cuda_backend.cpp uses SlimTensor CUDA utilities (e.g. getCurrentCUDAStream)
238+
# from aoti_common_shims_slim via headers; propagate the static lib so final
239+
# MSVC links (e.g. parakeet_runner) can resolve those C++ symbols.
240+
target_link_libraries(
241+
aoti_cuda_backend
242+
PRIVATE aoti_cuda_shims
243+
PUBLIC aoti_common_shims_slim
244+
)
195245
else()
196246
target_link_libraries(aoti_cuda_backend PUBLIC aoti_cuda_shims)
197247
endif()

0 commit comments

Comments
 (0)