README.md (1 addition, 1 deletion)
@@ -76,7 +76,7 @@ A complete infrastructure solution for deploying AI inference workloads on AWS u

 - Custom Ubuntu 24.04 AMI with Docker, NVIDIA drivers, and GPU support
 - Modular architecture: separate IAM, inference, and reusable modules
-- Multi-model deployment: Qwen 3 0.6B and GPT-OSS 20B configurations
+- Multi-model deployment: Qwen 3 0.6B, GPT-OSS 20B, and Gemma 3 27B configurations
 - vLLM server with systemd integration and container lifecycle management
 - GPU-enabled instances (g5.2xlarge) with automated provisioning
 - Comprehensive security: EBS encryption, restrictive security groups, IAM best practices
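As context for the README change: each model module deployed by this stack fronts a vLLM server, which exposes an OpenAI-compatible HTTP API. Below is a minimal client sketch, assuming vLLM's default port 8000, a placeholder instance IP, and that the caller's address is covered by allowed_ip_addresses; none of these specifics are confirmed by this PR.

```python
# Minimal client sketch for a deployed vLLM endpoint.
# Assumptions (not confirmed by this PR): the server listens on
# vLLM's default port 8000 and the instance is reachable from an
# address in allowed_ip_addresses.
import requests

INSTANCE_IP = "203.0.113.10"  # placeholder (TEST-NET-3 documentation address)

response = requests.post(
    f"http://{INSTANCE_IP}:8000/v1/chat/completions",
    json={
        "model": "google/gemma-3-27b-it",
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 64,
    },
    timeout=60,
)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```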
infrastructure/ai-inference/opentofu/inference/main.tf (26 additions, 8 deletions)
@@ -10,10 +10,10 @@ module "qwen3_0_6b" {
   key_name             = "qwen3_0_6b_key"
 
   allowed_ip_addresses = var.allowed_ip_addresses
-  hugging_face_token = var.hugging_face_token
-  model = "Qwen/Qwen3-0.6B"
-  vllm_timeout = 360
-  vllm_version = "latest"
+  hugging_face_token   = var.hugging_face_token
+  model                = "Qwen/Qwen3-0.6B"
+  vllm_timeout         = 360
+  vllm_version         = "latest"
 }
 
 module "gpt_oss_20b" {
@@ -26,8 +26,26 @@ module "gpt_oss_20b" {
   key_name             = "gpt_oss_20b_key"
 
   allowed_ip_addresses = var.allowed_ip_addresses
-  hugging_face_token = var.hugging_face_token
-  model = "openai/gpt-oss-20b"
-  vllm_timeout = 600
-  vllm_version = "v0.10.1"
+  hugging_face_token   = var.hugging_face_token
+  model                = "openai/gpt-oss-20b"
+  vllm_timeout         = 600
+  vllm_version         = "v0.10.1"
 }
+
+module "gemma_3_27b" {
+  source = "../modules/inference"
+
+  ami_id               = data.aws_ami.ai_inference.id
+  instance_type        = "g6.12xlarge"
+  vpc_id               = data.aws_vpc.default.id
+  subnet_id            = data.aws_subnets.default.ids[0]
+  iam_instance_profile = data.aws_iam_instance_profile.ai_inference_profile.name
+  key_name             = "gemma_3_27b_key"
+
+  allowed_ip_addresses = var.allowed_ip_addresses
+  hugging_face_token   = var.hugging_face_token
+  model                = "google/gemma-3-27b-it"
+  vllm_args            = "--tensor-parallel-size=4" # 4 GPUs
+  vllm_timeout         = 900
+  vllm_version         = "latest"
+}
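A note on the new module's sizing: g6.12xlarge provides 4 NVIDIA L4 GPUs with 24 GB of memory each, and a 27B-parameter model in bf16 needs roughly 54 GB for weights alone, more than any single card holds. That is why this module passes --tensor-parallel-size=4 to shard the model across all four GPUs (and presumably why vllm_timeout rises to 900 seconds, to cover the longer download and load). A back-of-the-envelope check, with all figures approximate rather than taken from this PR:

```python
# Back-of-the-envelope memory check for the tensor-parallel choice.
# All figures are approximations; none come from this PR.
params = 27e9            # Gemma 3 27B parameter count (approx.)
bytes_per_param = 2      # bf16 weights
weight_gb = params * bytes_per_param / 1e9     # ~54 GB of weights

gpu_mem_gb = 24          # NVIDIA L4; g6.12xlarge has 4 of them
num_gpus = 4             # matches --tensor-parallel-size=4

per_gpu_gb = weight_gb / num_gpus  # ~13.5 GB of weights per GPU,
# leaving headroom on each 24 GB card for KV cache and activations.
print(f"{weight_gb:.0f} GB total weights, {per_gpu_gb:.1f} GB per GPU "
      f"of {gpu_mem_gb} GB available")
```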