sierra-research · touchaponk · Jan 12, 2026 · Jan 28, 2026 · Jan 28, 2026
@@ -0,0 +1,60 @@
+{
+  "model_name": "amity-sigma-v3r",
+  "model_organization": "Amity",
+  "submitting_organization": "Amity",
+  "submission_date": "2026-01-10",
+  "submission_type": "custom",
+  "contact_info": {
+    "email": "touchapon@amity.co",
+    "name": "Touchapon K",
+    "github": "amity-arac"
+  },
+  "is_new": true,
+  "trajectories_available": true,
+  "references": [
+    {
+      "title": "Amity Sigma Thinking Model",
+      "url": "https://huggingface.co/amityco/amity-sigma-thinking-v3r",
+      "type": "huggingface"
+    },
+    {
+      "title": "ROAD: Auto Optimization for Agentic Tasks",
+      "url": "https://arxiv.org/abs/2512.24040",
+      "type": "paper"
+    }
+  ],
+  "results": {
+    "retail": {
+      "pass_1": 78.51,
+      "pass_2": 67.40,
+      "pass_3": 60.53,
+      "pass_4": 56.14,
+      "cost": null
+    },
+    "airline": {
+      "pass_1": 55.50,
+      "pass_2": 45.00,
+      "pass_3": 38.50,
+      "pass_4": 34.00,
+      "cost": null
+    },
+    "telecom": {
+      "pass_1": 32.89,
+      "pass_2": 24.71,
+      "pass_3": 19.96,
+      "pass_4": 16.67,
+      "cost": null
+    }
+  },
+  "methodology": {
+    "evaluation_date": "2026-01-03",
+    "tau2_bench_version": "v1.0",
+    "user_simulator": "gpt-4.1",
+    "notes": "Base model: Qwen3-4B-Thinking. Training approach: Synthetic data generation from Amity Sigma with human-in-the-loop feedback, followed by ROAD (Auto Optimization for Agentic Tasks) optimization techniques and GRPO finetuning.",
+    "verification": {
+      "modified_prompts": true,
+      "omitted_questions": false,
+      "details": "Modified retail domain policy.md for ROAD optimization. All tasks evaluated with 4 trials each. Model served via vLLM with deepseek_r1 reasoning parser."
+    }
+  }
+}