README.md (1 addition, 1 deletion)
@@ -76,7 +76,7 @@ A complete infrastructure solution for deploying AI inference workloads on AWS u

 - Custom Ubuntu 24.04 AMI with Docker, NVIDIA drivers, and GPU support
 - Modular architecture: separate IAM, inference, and reusable modules
-- Multi-model deployment: Qwen 3 0.6B and GPT-OSS 20B configurations
+- Multi-model deployment: Qwen 3 0.6B, GPT-OSS 20B, and Gemma 3 27B configurations
 - vLLM server with systemd integration and container lifecycle management
 - GPU-enabled instances (g5.2xlarge) with automated provisioning
 - Comprehensive security: EBS encryption, restrictive security groups, IAM best practices
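As context for the README change: each model module deployed by this stack fronts a vLLM server, which exposes an OpenAI-compatible HTTP API. Below is a minimal client sketch, assuming vLLM's default port 8000, a placeholder instance IP, and that the caller's address is covered by allowed_ip_addresses; none of these specifics are confirmed by this PR.

```python
# Minimal client sketch for a deployed vLLM endpoint.
# Assumptions (not confirmed by this PR): the server listens on
# vLLM's default port 8000 and the instance is reachable from an
# address in allowed_ip_addresses.
import requests

INSTANCE_IP = "203.0.113.10"  # placeholder (TEST-NET-3 documentation address)

response = requests.post(
    f"http://{INSTANCE_IP}:8000/v1/chat/completions",
    json={
        "model": "google/gemma-3-27b-it",
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 64,
    },
    timeout=60,
)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```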
infrastructure/ai-inference/opentofu/inference/main.tf (26 additions, 8 deletions)
@@ -10,10 +10,10 @@ module "qwen3_0_6b" {
   key_name             = "qwen3_0_6b_key"
 
   allowed_ip_addresses = var.allowed_ip_addresses
-  hugging_face_token = var.hugging_face_token
-  model = "Qwen/Qwen3-0.6B"
-  vllm_timeout = 360
-  vllm_version = "latest"
+  hugging_face_token   = var.hugging_face_token
+  model                = "Qwen/Qwen3-0.6B"
+  vllm_timeout         = 360
+  vllm_version         = "latest"
 }
 
 module "gpt_oss_20b" {
@@ -26,8 +26,26 @@ module "gpt_oss_20b" {
   key_name             = "gpt_oss_20b_key"
 
   allowed_ip_addresses = var.allowed_ip_addresses
-  hugging_face_token = var.hugging_face_token
-  model = "openai/gpt-oss-20b"
-  vllm_timeout = 600
-  vllm_version = "v0.10.1"
+  hugging_face_token   = var.hugging_face_token
+  model                = "openai/gpt-oss-20b"
+  vllm_timeout         = 600
+  vllm_version         = "v0.10.1"
 }
+
+module "gemma_3_27b" {
+  source = "../modules/inference"
+
+  ami_id               = data.aws_ami.ai_inference.id
+  instance_type        = "g6.12xlarge"
+  vpc_id               = data.aws_vpc.default.id
+  subnet_id            = data.aws_subnets.default.ids[0]
+  iam_instance_profile = data.aws_iam_instance_profile.ai_inference_profile.name
+  key_name             = "gemma_3_27b_key"
+
+  allowed_ip_addresses = var.allowed_ip_addresses
+  hugging_face_token   = var.hugging_face_token
+  model                = "google/gemma-3-27b-it"
+  vllm_args            = "--tensor-parallel-size=4" # 4 GPUs
+  vllm_timeout         = 900
+  vllm_version         = "latest"
+}
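A note on the new module's sizing: g6.12xlarge provides 4 NVIDIA L4 GPUs with 24 GB of memory each, and a 27B-parameter model in bf16 needs roughly 54 GB for weights alone, more than any single card holds. That is why this module passes --tensor-parallel-size=4 to shard the model across all four GPUs (and presumably why vllm_timeout rises to 900 seconds, to cover the longer download and load). A back-of-the-envelope check, with all figures approximate rather than taken from this PR:

```python
# Back-of-the-envelope memory check for the tensor-parallel choice.
# All figures are approximations; none come from this PR.
params = 27e9            # Gemma 3 27B parameter count (approx.)
bytes_per_param = 2      # bf16 weights
weight_gb = params * bytes_per_param / 1e9     # ~54 GB of weights

gpu_mem_gb = 24          # NVIDIA L4; g6.12xlarge has 4 of them
num_gpus = 4             # matches --tensor-parallel-size=4

per_gpu_gb = weight_gb / num_gpus  # ~13.5 GB of weights per GPU,
# leaving headroom on each 24 GB card for KV cache and activations.
print(f"{weight_gb:.0f} GB total weights, {per_gpu_gb:.1f} GB per GPU "
      f"of {gpu_mem_gb} GB available")
```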