Skip to content

Commit 1b67be7

Browse files
benshi34 and hongjin-su authored
Submit Toolorchestra to leaderboard - Revised (#119)
* upload
* update
* Update submission.json
* toolorchestra submission + custom verification logic
* address comments

---------

Co-authored-by: hongjin-su <114016954+hongjin-su@users.noreply.github.com>
1 parent 5704c21 commit 1b67be7

File tree

8 files changed

+171712
-10
lines changed

8 files changed

+171712
-10
lines changed
31.1 KB
Loading

web/leaderboard/public/submissions/manifest.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
"qwen3-max_qwen_2024_09_23",
1515
"gemini-3-pro_google_2025-11-18",
1616
"deepseek-v3.2_deepseek_2025-12-01",
17-
"qwen3-max_qwen_2025-10-30"
17+
"qwen3-max_qwen_2025-10-30",
18+
"toolorchestra_nvidia_2025-12-02"
1819
],
19-
"last_updated": "2025-12-01T00:00:00Z"
20+
"last_updated": "2025-12-17T00:00:00Z"
2021
}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
{
2+
"model_name": "Nemotron-Orchestrator-8B",
3+
"model_organization": "NVIDIA",
4+
"submitting_organization": "NVIDIA",
5+
"submission_date": "2025-12-18",
6+
"contact_info": {
7+
"email": "sdiao@nvidia.com",
8+
"name": "Shizhe Diao"
9+
},
10+
"is_new": true,
11+
"trajectories_available": true,
12+
"results": {
13+
"retail": {
14+
"pass_1": 84.2,
15+
"pass_2": null,
16+
"pass_3": null,
17+
"pass_4": null
18+
},
19+
"airline": {
20+
"pass_1": 56.0,
21+
"pass_2": null,
22+
"pass_3": null,
23+
"pass_4": null
24+
},
25+
"telecom": {
26+
"pass_1": 88.6,
27+
"pass_2": null,
28+
"pass_3": null,
29+
"pass_4": null
30+
}
31+
},
32+
"submission_type": "custom",
33+
"methodology": {
34+
"evaluation_date": "2025-09-15",
35+
"tau2_bench_version": "0.1.3",
36+
"user_simulator": "gpt-5",
37+
"notes": "This submission uses an orchestrator that selects between strong and weak models based on task complexity. See our GitHub repo for full implementation details.",
38+
"verification": {
39+
"modified_prompts": true,
40+
"omitted_questions": false,
41+
"details": "Modified the agent system prompt to support orchestrator framework. No questions were omitted."
42+
}
43+
},
44+
"references": [
45+
{
46+
"title": "ToolOrchestra: Elevating Intelligence via Efficient Model and Tool Orchestration",
47+
"url": "https://arxiv.org/abs/2511.21689",
48+
"type": "paper"
49+
},
50+
{
51+
"title": "ToolOrchestra GitHub Repository",
52+
"url": "https://github.com/NVlabs/ToolOrchestra",
53+
"type": "github"
54+
}
55+
]
56+
}

web/leaderboard/public/submissions/toolorchestra_nvidia_2025-12-02/trajectories/toolorchestra_airline_gpt-5_1trial.json

Lines changed: 24200 additions & 0 deletions
Large diffs are not rendered by default.

web/leaderboard/public/submissions/toolorchestra_nvidia_2025-12-02/trajectories/toolorchestra_retail_gpt-5_1trial.json

Lines changed: 49094 additions & 0 deletions
Large diffs are not rendered by default.

web/leaderboard/public/submissions/toolorchestra_nvidia_2025-12-02/trajectories/toolorchestra_telecom_gpt-5_1trial.json

Lines changed: 98343 additions & 0 deletions
Large diffs are not rendered by default.

web/leaderboard/src/components/Leaderboard.jsx

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -143,9 +143,11 @@ const Leaderboard = () => {
143143
organization: submission.submitting_organization,
144144
userSimulator: submission.methodology?.user_simulator || null,
145145
// Add verification status
146+
// For 'custom' submissions, we relax the modified_prompts constraint
147+
// Custom submissions are allowed to modify prompts as long as they have trajectories and don't omit questions
146148
isVerified: submission.trajectories_available &&
147-
submission.methodology?.verification?.modified_prompts === false &&
148-
submission.methodology?.verification?.omitted_questions === false,
149+
submission.methodology?.verification?.omitted_questions === false &&
150+
(submission.submission_type === 'custom' || submission.methodology?.verification?.modified_prompts === false),
149151
verificationDetails: submission.methodology?.verification || null,
150152
// Submission type: 'standard' (default) or 'custom'
151153
submissionType: submission.submission_type || 'standard'
@@ -800,10 +802,13 @@ const Leaderboard = () => {
800802
{(model.organization === 'Alibaba' || model.organization === 'Qwen') && (
801803
<img src={`${import.meta.env.BASE_URL}qwen-color.png`} alt="Qwen" className="logo-img" />
802804
)}
803-
{model.organization === 'Google' && (
804-
<img src={`${import.meta.env.BASE_URL}Google__G__logo.svg.png`} alt="Google" className="logo-img" />
805-
)}
806-
</div>
805+
{model.organization === 'Google' && (
806+
<img src={`${import.meta.env.BASE_URL}Google__G__logo.svg.png`} alt="Google" className="logo-img" />
807+
)}
808+
{model.organization === 'NVIDIA' && (
809+
<img src={`${import.meta.env.BASE_URL}Logo-nvidia-transparent-PNG.png`} alt="NVIDIA" className="logo-img" />
810+
)}
811+
</div>
807812
<span className="org-name">{model.organization}</span>
808813
</div>
809814
</td>
@@ -1076,8 +1081,8 @@ const Leaderboard = () => {
10761081
<div className="verification-status">
10771082
<div className="verification-indicator">
10781083
{selectedSubmission.trajectories_available &&
1079-
selectedSubmission.methodology.verification.modified_prompts === false &&
1080-
selectedSubmission.methodology.verification.omitted_questions === false ? (
1084+
selectedSubmission.methodology.verification.omitted_questions === false &&
1085+
(selectedSubmission.submission_type === 'custom' || selectedSubmission.methodology.verification.modified_prompts === false) ? (
10811086
<span className="verified">✅ Verified</span>
10821087
) : (
10831088
<span className="unverified">⚠️ Unverified</span>

web/leaderboard/src/components/TrajectoryVisualizer.jsx

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,9 @@ const TrajectoryVisualizer = () => {
140140
],
141141
'Qwen3-Max-Thinking-Preview': [
142142
'{domain}_llm_agent_qwen3-max-2025-10-30_user_simulator_gpt-4.1-2025-04-14.json'
143+
],
144+
'ToolOrchestra': [
145+
'toolorchestra_{domain}_gpt-5_1trial.json'
143146
]
144147
}
145148

0 commit comments

Comments
 (0)