Skip to content

Commit 711d155

Browse files
authored
[QNN EP] Enablement of 64bit Udma mode (#26677)
### Description
Enable 64-bit UDMA mode for device architecture v81 or later.
### Motivation and Context
Support 64-bit UDMA mode so that models run efficiently on HTP targets v81 and above.
1 parent a5dc0f9 commit 711d155

File tree

9 files changed

+65
-11
lines changed

9 files changed

+65
-11
lines changed

onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1168,7 +1168,7 @@ Status QnnBackendManager::ResetContextPriority() {
11681168
return SetContextPriority(context_priority_);
11691169
}
11701170

1171-
Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing) {
1171+
Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing, bool enable_htp_extended_udma_mode) {
11721172
if (true == context_created_) {
11731173
LOGS_DEFAULT(INFO) << "Context created already.";
11741174
return Status::OK();
@@ -1184,8 +1184,16 @@ Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing) {
11841184
QnnContext_Config_t context_priority_config = QNN_CONTEXT_CONFIG_INIT;
11851185
ORT_RETURN_IF_ERROR(SetQnnContextConfig(context_priority_, context_priority_config));
11861186

1187+
QnnContext_Config_t context_config_extended_udma = QNN_CONTEXT_CONFIG_INIT;
1188+
QnnHtpContext_CustomConfig_t udma_custom_config;
1189+
udma_custom_config.option = QNN_HTP_CONTEXT_CONFIG_OPTION_USE_EXTENDED_UDMA;
1190+
udma_custom_config.useExtendedUdma = enable_htp_extended_udma_mode;
1191+
context_config_extended_udma.option = QNN_CONTEXT_CONFIG_OPTION_CUSTOM;
1192+
context_config_extended_udma.customConfig = &udma_custom_config;
1193+
11871194
const QnnContext_Config_t* npu_context_configs[] = {&context_priority_config,
11881195
&context_config_weight_sharing,
1196+
&context_config_extended_udma,
11891197
nullptr};
11901198

11911199
const QnnContext_Config_t* empty_context_configs[] = {nullptr};
@@ -1568,7 +1576,8 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger,
15681576
bool enable_vtcm_backup_buffer_sharing,
15691577
bool enable_file_mapped_weights,
15701578
std::shared_ptr<qnn::RpcMemLibrary> rpcmem_library,
1571-
std::unordered_map<std::string, std::unique_ptr<std::vector<std::string>>>& context_bin_map) {
1579+
std::unordered_map<std::string, std::unique_ptr<std::vector<std::string>>>& context_bin_map,
1580+
bool enable_htp_extended_udma_mode) {
15721581
std::lock_guard<std::recursive_mutex> lock(logger_recursive_mutex_);
15731582
if (backend_setup_completed_) {
15741583
LOGS(logger, VERBOSE) << "Backend setup already!";
@@ -1679,7 +1688,7 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger,
16791688

16801689
if (status.IsOK() && (vtcm_backup_buffer_sharing_enabled_ || !load_from_cached_context)) {
16811690
status = vtcm_backup_buffer_sharing_enabled_ ? CreateContextVtcmBackupBufferSharingEnabled(context_bin_map)
1682-
: CreateContext(enable_htp_weight_sharing);
1691+
: CreateContext(enable_htp_weight_sharing, enable_htp_extended_udma_mode);
16831692

16841693
if (status.IsOK()) {
16851694
LOGS(logger, VERBOSE) << "CreateContext succeed.";

onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,8 @@ class QnnBackendManager : public std::enable_shared_from_this<QnnBackendManager>
171171
bool enable_vtcm_backup_buffer_sharing,
172172
bool enable_file_mapped_weights,
173173
std::shared_ptr<qnn::RpcMemLibrary> rpcmem_library,
174-
std::unordered_map<std::string, std::unique_ptr<std::vector<std::string>>>& context_bin_map);
174+
std::unordered_map<std::string, std::unique_ptr<std::vector<std::string>>>& context_bin_map,
175+
bool enable_htp_extended_udma_mode);
175176

176177
Status CreateHtpPowerCfgId(uint32_t deviceId, uint32_t coreId, uint32_t& htp_power_config_id);
177178

@@ -299,7 +300,7 @@ class QnnBackendManager : public std::enable_shared_from_this<QnnBackendManager>
299300

300301
Status ReleaseProfilehandle();
301302

302-
Status CreateContext(bool enable_htp_weight_sharing);
303+
Status CreateContext(bool enable_htp_weight_sharing, bool enable_htp_extended_udma_mode);
303304

304305
Status GetFileSizeIfValid(const std::string& filepath, size_t& file_size);
305306

onnxruntime/core/providers/qnn/qnn_execution_provider.cc

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,19 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
602602
}
603603
}
604604

605+
static const std::string QNN_HTP_EXTENDED_UDMA_MODE = "extended_udma";
606+
auto htp_extended_udma_pos = provider_options_map.find(QNN_HTP_EXTENDED_UDMA_MODE);
607+
if (htp_extended_udma_pos != provider_options_map.end()) {
608+
if ("1" == htp_extended_udma_pos->second) {
609+
enable_htp_extended_udma_mode_ = true;
610+
} else if ("0" == htp_extended_udma_pos->second) {
611+
enable_htp_extended_udma_mode_ = false;
612+
} else {
613+
// Report the rejected option string itself rather than the boolean member, so the log shows what the
// user actually passed. The flag is not modified on this path.
LOGS_DEFAULT(WARNING) << "Invalid extended_udma mode: " << htp_extended_udma_pos->second << " only 0 or 1 allowed. Set to 0.";
614+
}
615+
LOGS_DEFAULT(VERBOSE) << "User specified extended_udma mode: " << enable_htp_extended_udma_mode_;
616+
}
617+
605618
// Option to skip QNN API interface version check to use other QNN library other than default.
606619
static const std::string SKIP_QNN_VERSION_CHECK = "skip_qnn_version_check";
607620
auto skip_qnn_version_check = ParseBoolOption(SKIP_QNN_VERSION_CHECK, false, provider_options_map);
@@ -1006,7 +1019,8 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer
10061019
enable_vtcm_backup_buffer_sharing_,
10071020
enable_file_mapped_weights_,
10081021
rpcmem_library_,
1009-
context_bin_map);
1022+
context_bin_map,
1023+
enable_htp_extended_udma_mode_);
10101024

10111025
context_bin_map.clear();
10121026

onnxruntime/core/providers/qnn/qnn_execution_provider.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ class QNNExecutionProvider : public IExecutionProvider {
127127
qnn::ModelSettings model_settings_ = {};
128128
bool dump_json_qnn_graph_ = false;
129129
std::string json_qnn_graph_dir_ = "";
130+
bool enable_htp_extended_udma_mode_ = false;
130131

131132
// Whether this is set depends on a session option enabling it and if the RPCMEM dynamic library is available.
132133
// This is potentially shared with HtpSharedMemoryAllocator which may be returned by CreatePreferredAllocators().

onnxruntime/test/ep_weight_sharing_ctx_gen/command_args_parser.cc

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ namespace qnnctxgen {
7373
"\t [QNN only] [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). \n"
7474
"\t Defaults to '1' (another EP (typically CPU EP) handles the graph I/O quantization and dequantization). \n"
7575
"\t [QNN only] [enable_htp_spill_fill_buffer]: Enable HTP spill file buffer, used while generating QNN context binary.\n"
76+
"\t [QNN only] [extended_udma]: Enable HTP extended UDMA mode for better performance on supported hardware, options: \n"
77+
"\t '0' (disabled), '1' (enabled). Default: '0'. \n"
7678
"\t [Example] -i \"vtcm_mb|8 htp_arch|73\" \n"
7779
"\n"
7880
"\t-h: help\n");
@@ -253,7 +255,7 @@ static bool ParsePluginEpConfig(const std::string& json_file_path, PluginEpConfi
253255
ORT_THROW("Wrong value for htp_graph_finalization_optimization_mode. select from: " + str);
254256
}
255257
} else if (key == "enable_htp_fp16_precision" || key == "offload_graph_io_quantization" ||
256-
key == "enable_htp_spill_fill_buffer") {
258+
key == "enable_htp_spill_fill_buffer" || key == "extended_udma") {
257259
std::unordered_set<std::string> supported_options = {"0", "1"};
258260
if (supported_options.find(value) == supported_options.end()) {
259261
std::ostringstream str_stream;
@@ -266,7 +268,7 @@ static bool ParsePluginEpConfig(const std::string& json_file_path, PluginEpConfi
266268
ORT_THROW(
267269
"Wrong key type entered. Choose from options: ['backend_type', 'backend_path', 'vtcm_mb', "
268270
"'htp_performance_mode', 'htp_graph_finalization_optimization_mode', 'soc_model', 'htp_arch', "
269-
"'enable_htp_fp16_precision', 'offload_graph_io_quantization', 'enable_htp_spill_fill_buffer']");
271+
"'enable_htp_fp16_precision', 'offload_graph_io_quantization', 'enable_htp_spill_fill_buffer', 'extended_udma']");
270272
}
271273

272274
test_config.run_config.provider_options[key] = value;

onnxruntime/test/onnx/main.cc

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ void usage() {
9090
"\t Otherwise, it will be fp32 precision. Works for float32 model for HTP backend. Defaults to '1' (with FP16 precision.). \n"
9191
"\t [QNN only] [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). \n"
9292
"\t Defaults to '0' (QNN EP handles the graph I/O quantization and dequantization). \n"
93+
"\t [QNN only] [extended_udma]: Enable HTP extended UDMA mode for better performance on supported hardware, options: \n"
94+
"\t '0' (disabled), '1' (enabled). Default: '0'. \n"
9395
"\t [Usage]: -e <provider_name> -i '<key1>|<value1> <key2>|<value2>' \n\n"
9496
"\t [Example] [For QNN EP] -e qnn -i \"profiling_level|detailed backend_type|cpu\" \n\n"
9597
"\t [SNPE only] [runtime]: SNPE runtime, options: 'CPU', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n"
@@ -612,7 +614,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
612614
std::string str = str_stream.str();
613615
ORT_THROW("Wrong value for htp_arch. select from: " + str);
614616
}
615-
} else if (key == "enable_htp_fp16_precision" || key == "offload_graph_io_quantization") {
617+
} else if (key == "enable_htp_fp16_precision" || key == "offload_graph_io_quantization" || key == "extended_udma") {
616618
std::unordered_set<std::string> supported_options = {"0", "1"};
617619
if (supported_options.find(value) == supported_options.end()) {
618620
std::ostringstream str_stream;
@@ -626,7 +628,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
626628
"Wrong key type entered. Choose from options: ['backend_type', 'backend_path', "
627629
"'profiling_level', 'profiling_file_path', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode', "
628630
"'qnn_saver_path', 'htp_graph_finalization_optimization_mode', 'op_packages', 'qnn_context_priority', "
629-
"'soc_model', 'htp_arch', 'device_id', 'enable_htp_fp16_precision', 'offload_graph_io_quantization']");
631+
"'soc_model', 'htp_arch', 'device_id', 'enable_htp_fp16_precision', 'offload_graph_io_quantization', 'extended_udma']");
630632
}
631633

632634
qnn_options[key] = value;

onnxruntime/test/perftest/command_args_parser.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,8 @@ ABSL_FLAG(std::string, i, "",
116116
" [QNN only] [enable_htp_spill_fill_buffer]: Enable HTP spill fill buffer, used while generating QNN context binary.\n"
117117
" [QNN only] [enable_htp_shared_memory_allocator]: Enable the QNN HTP shared memory allocator and use it for inputs and outputs. Requires libcdsprpc.so/dll to be available.\n"
118118
" Defaults to '0' (disabled).\n"
119+
" [QNN only] [extended_udma]: Enable HTP extended UDMA mode for better performance on supported hardware, options: \n"
120+
" '0' (disabled), '1' (enabled). Default: '0'. \n"
119121
" [Example] [For QNN EP] -e qnn -i \"backend_type|cpu\" \n"
120122
"\n"
121123
" [TensorRT only] [trt_max_partition_iterations]: Maximum iterations for TensorRT parser to get capability.\n"

onnxruntime/test/perftest/ort_test_session.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,8 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
258258
"qnn_saver_path", "htp_graph_finalization_optimization_mode", "qnn_context_priority",
259259
"htp_arch", "enable_htp_fp16_precision", "offload_graph_io_quantization",
260260
"enable_htp_spill_fill_buffer", "enable_htp_shared_memory_allocator", "dump_json_qnn_graph",
261-
"json_qnn_graph_dir", "disable_file_mapped_weights", "htp_bf16_enable", "enable_vtcm_backup_buffer_sharing"});
261+
"json_qnn_graph_dir", "disable_file_mapped_weights", "htp_bf16_enable", "enable_vtcm_backup_buffer_sharing", "extended_udma"});
262+
262263
for (const auto& provider_option : provider_options) {
263264
const std::string& key = provider_option.first;
264265
const std::string& value = provider_option.second;
@@ -323,6 +324,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
323324
key == "enable_htp_spill_fill_buffer" ||
324325
key == "enable_htp_shared_memory_allocator" ||
325326
key == "dump_json_qnn_graph" ||
327+
key == "extended_udma" ||
326328
key == "disable_file_mapped_weights" ||
327329
key == "enable_vtcm_backup_buffer_sharing") {
328330
std::set<std::string> supported_options = {"0", "1"};

onnxruntime/test/providers/qnn/qnn_basic_test.cc

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1314,6 +1314,27 @@ TEST_F(QnnHTPBackendTests, DumpJsonQNNGraph) {
13141314
std::filesystem::remove_all(dump_dir);
13151315
}
13161316

1317+
// Test extended UDMA mode on supported hardware (should run successfully)
// Builds a two-input float Add model and runs it through RunQnnModelTest with the HTP backend,
// htp_arch forced to "81" and the "extended_udma" provider option set to "1".
1318+
TEST_F(QnnHTPBackendTests, ExtendedUdmaModeTest) {
1319+
// Create provider options with extended UDMA mode enabled
1320+
ProviderOptions options;
1321+
options["backend_type"] = "htp";
1322+
options["offload_graph_io_quantization"] = "0";
1323+
options["htp_arch"] = "81";
1324+
options["extended_udma"] = "1";
1325+
1326+
// Define a simple model with Add operation: two float inputs of shape {1, 3, 4, 4}.
// NOTE(review): the trailing (false, -10.0f, 10.0f) arguments look like "not an initializer" plus a
// random value range - confirm against the TestInputDef constructor.
1327+
auto input_defs = {TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f),
1328+
TestInputDef<float>({1, 3, 4, 4}, false, -10.0f, 10.0f)};
1329+
1330+
// Run the test - this should succeed because v81 supports extended UDMA
// NOTE(review): 13 is presumably the ONNX opset version and 0.008f the accuracy tolerance -
// confirm against the RunQnnModelTest signature.
1331+
RunQnnModelTest(BuildOpTestCase<float>("Add", input_defs, {}, {}, kOnnxDomain),
1332+
options,
1333+
13,
1334+
ExpectedEPNodeAssignment::All,
1335+
0.008f);
1336+
}
1337+
13171338
// Test option for offloading quantization of graph inputs and dequantization of graph outputs to the CPU EP.
13181339
TEST_F(QnnHTPBackendTests, EPOffloadsGraphIOQuantDequant) {
13191340
// Returns a function that checks that the Q/DQ ops at the graph IO boundary are offloaded to CPU

0 commit comments

Comments
 (0)