
Commit ba47a16

Add dmrlet container orchestrator for AI inference
This commit introduces dmrlet, a purpose-built container orchestrator designed specifically for AI inference workloads. Unlike Kubernetes, dmrlet focuses exclusively on running stateless inference containers with zero configuration overhead. Multi-GPU mapping "just works" without YAML, device plugins, or node selectors.

The orchestrator supports multiple inference backends, including llama.cpp, vLLM, and SGLang, with automatic backend detection based on model format. It provides seamless multi-GPU allocation and management, along with auto-scaling based on QPS, latency, and GPU utilization metrics.

Signed-off-by: Eric Curtin <eric.curtin@docker.com>
1 parent 232f06b commit ba47a16

File tree

26 files changed: +5270, -46 lines

README.md

Lines changed: 109 additions & 0 deletions
@@ -415,6 +415,115 @@ in the form of [a Helm chart and static YAML](charts/docker-model-runner/README.
If you are interested in a specific Kubernetes use-case, please start a
discussion on the issue tracker.

## dmrlet: Container Orchestrator for AI Inference

dmrlet is a purpose-built container orchestrator for AI inference workloads. Unlike Kubernetes, it focuses exclusively on running stateless inference containers with zero configuration overhead. Multi-GPU mapping "just works" without YAML, device plugins, or node selectors.

### Key Features

| Feature | Kubernetes | dmrlet |
|---------|------------|--------|
| Multi-GPU setup | Device plugins + node selectors + resource limits YAML | `dmrlet serve llama3 --gpus all` |
| Config overhead | 50+ lines YAML minimum | Zero YAML, CLI-only |
| Time to first inference | Minutes (pod scheduling, image pull) | Seconds (model already local) |
| Model management | External (mount PVCs, manage yourself) | Integrated with Docker Model Runner store |

### Building dmrlet

```bash
# Build the dmrlet binary
go build -o dmrlet ./cmd/dmrlet

# Verify it works
./dmrlet --help
```

### Usage

**Start the daemon:**
```bash
# Start in foreground
dmrlet daemon

# With custom socket path
dmrlet daemon --socket /tmp/dmrlet.sock
```

**Serve a model:**
```bash
# Auto-detect backend and GPUs
dmrlet serve llama3.2

# Specify backend
dmrlet serve llama3.2 --backend vllm

# Specify GPU allocation
dmrlet serve llama3.2 --gpus 0,1
dmrlet serve llama3.2 --gpus all

# Multiple replicas
dmrlet serve llama3.2 --replicas 2

# Backend-specific options
dmrlet serve llama3.2 --ctx-size 4096   # llama.cpp context size
dmrlet serve llama3.2 --gpu-memory 0.8  # vLLM GPU memory utilization
```

**List running models:**
```bash
dmrlet ps
# MODEL     BACKEND    REPLICAS  GPUS       ENDPOINTS        STATUS
# llama3.2  llama.cpp  1         [0,1,2,3]  localhost:30000  healthy
```

**View logs:**
```bash
dmrlet logs llama3.2     # Last 100 lines
dmrlet logs llama3.2 -f  # Follow logs
```

**Scale replicas:**
```bash
dmrlet scale llama3.2 4  # Scale to 4 replicas
```

**Stop a model:**
```bash
dmrlet stop llama3.2
dmrlet stop --all  # Stop all models
```

**Check status:**
```bash
dmrlet status
# DAEMON: running
# SOCKET: /var/run/dmrlet.sock
#
# GPUS:
#   GPU 0: NVIDIA A100 80GB  81920MB  (in use: llama3.2)
#   GPU 1: NVIDIA A100 80GB  81920MB  (available)
#
# MODELS: 1 running
```

### Supported Backends

- **llama.cpp** - Default backend for GGUF models
- **vLLM** - High-throughput serving for safetensors models
- **SGLang** - Fast serving with RadixAttention

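The backend is picked automatically from the model format, as the table and serve examples above suggest. Here is a minimal sketch of what format-based detection can look like; the function name and dispatch logic are illustrative assumptions, not dmrlet's actual internals:

```go
// Illustrative sketch of format-based backend detection; names and
// logic are assumptions, not the actual dmrlet implementation.
package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// detectBackend maps a model file format to a serving backend:
// GGUF defaults to llama.cpp, safetensors to vLLM.
func detectBackend(modelPath string) string {
	switch strings.ToLower(filepath.Ext(modelPath)) {
	case ".gguf":
		return "llama.cpp"
	case ".safetensors":
		return "vllm"
	default:
		return "llama.cpp" // llama.cpp is the default backend
	}
}

func main() {
	fmt.Println(detectBackend("llama3.2-Q4_K_M.gguf")) // llama.cpp
	fmt.Println(detectBackend("model.safetensors"))    // vllm
}
```
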
### Architecture

```
dmrlet daemon
├── GPU Manager        - Auto-detect and allocate GPUs
├── Container Manager  - Docker-based container lifecycle
├── Service Registry   - Endpoint discovery with load balancing
├── Health Monitor     - Auto-restart unhealthy containers
├── Auto-scaler        - Scale based on QPS/latency/GPU utilization
└── Log Aggregator     - Centralized log collection
```

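The Auto-scaler component in the tree above reacts to QPS, latency, and GPU utilization. A rough sketch of that kind of scaling decision follows; the metric fields and thresholds are invented for illustration, since dmrlet's real signals and cutoffs are not among the files excerpted here:

```go
// Hypothetical sketch of a QPS/latency/GPU-utilization scaling rule;
// field names and thresholds are invented for illustration.
package main

import "fmt"

// metrics is a snapshot of the signals the auto-scaler watches.
type metrics struct {
	qpsPerReplica float64 // requests/s handled by each replica
	p95LatencyMS  float64 // 95th-percentile request latency
	gpuUtil       float64 // 0.0-1.0 across allocated GPUs
}

// desiredReplicas adds a replica when any signal runs hot, removes one
// when all signals are cold, and clamps the result to [min, max].
func desiredReplicas(current, min, max int, m metrics) int {
	switch {
	case m.qpsPerReplica > 50 || m.p95LatencyMS > 500 || m.gpuUtil > 0.9:
		current++
	case m.qpsPerReplica < 10 && m.p95LatencyMS < 100 && m.gpuUtil < 0.3:
		current--
	}
	if current < min {
		current = min
	}
	if current > max {
		current = max
	}
	return current
}

func main() {
	hot := metrics{qpsPerReplica: 80, p95LatencyMS: 620, gpuUtil: 0.95}
	fmt.Println(desiredReplicas(2, 1, 8, hot)) // 3
}
```
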
## Community

For general questions and discussion, please use [Docker Model Runner's Slack channel](https://dockercommunity.slack.com/archives/C09H9P5E57B).

cmd/dmrlet/commands/daemon.go

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
package commands

import (
	"context"
	"fmt"
	"os"
	"os/signal"
	"syscall"

	"github.com/docker/model-runner/pkg/dmrlet/daemon"
	"github.com/spf13/cobra"
)

var (
	containerdAddress string
	modelStorePath    string
)

func newDaemonCmd() *cobra.Command {
	cmd := &cobra.Command{
		Use:   "daemon",
		Short: "Start the dmrlet daemon",
		Long: `Start the dmrlet daemon process.

The daemon manages inference containers, handles GPU allocation,
and provides the API for other dmrlet commands.

Examples:
  # Start daemon with default settings
  dmrlet daemon

  # Start with custom socket path
  dmrlet daemon --socket /tmp/dmrlet.sock

  # Start with custom containerd address
  dmrlet daemon --containerd /run/containerd/containerd.sock`,
		RunE: runDaemon,
	}

	cmd.Flags().StringVar(&containerdAddress, "containerd", "/run/containerd/containerd.sock",
		"Path to containerd socket")
	cmd.Flags().StringVar(&modelStorePath, "store", "",
		"Path to model store (default: ~/.docker/model-runner/models)")

	return cmd
}

func runDaemon(cmd *cobra.Command, args []string) error {
	config := daemon.DefaultConfig()
	config.SocketPath = socketPath
	config.ContainerdAddress = containerdAddress
	if modelStorePath != "" {
		config.ModelStorePath = modelStorePath
	}

	d, err := daemon.New(config)
	if err != nil {
		return fmt.Errorf("failed to create daemon: %w", err)
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Handle signals
	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)

	// Start daemon
	if err := d.Start(ctx); err != nil {
		return fmt.Errorf("failed to start daemon: %w", err)
	}

	fmt.Printf("dmrlet daemon started on %s\n", config.SocketPath)

	// Wait for signal
	sig := <-sigCh
	fmt.Printf("\nReceived signal %v, shutting down...\n", sig)

	// Graceful shutdown
	if err := d.Stop(ctx); err != nil {
		return fmt.Errorf("failed to stop daemon: %w", err)
	}

	fmt.Println("Daemon stopped")
	return nil
}
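
Two notes on the code above: `socketPath` is not declared in this file, so it is presumably bound to the `--socket` flag elsewhere in the `commands` package and shared by the other commands. And although `pkg/dmrlet/daemon` is not shown in this excerpt, the fields set in `runDaemon` imply a config along these lines (an inferred sketch, not the actual source):

```go
// Sketch inferred from runDaemon above; not the actual
// pkg/dmrlet/daemon source, which is not shown in this excerpt.
package daemon

// Config carries the daemon's startup settings.
type Config struct {
	SocketPath        string // Unix socket the daemon API listens on
	ContainerdAddress string // containerd socket for container lifecycle
	ModelStorePath    string // Docker Model Runner model store location
}

// DefaultConfig returns defaults matching values shown elsewhere in
// this commit (e.g. /var/run/dmrlet.sock in `dmrlet status` output).
func DefaultConfig() Config {
	return Config{
		SocketPath:        "/var/run/dmrlet.sock",
		ContainerdAddress: "/run/containerd/containerd.sock",
	}
}
```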

cmd/dmrlet/commands/logs.go

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
package commands

import (
	"context"
	"fmt"
	"os"
	"os/signal"
	"syscall"

	"github.com/docker/model-runner/pkg/dmrlet/daemon"
	"github.com/spf13/cobra"
)

var (
	logsFollow bool
	logsTail   int
)

func newLogsCmd() *cobra.Command {
	cmd := &cobra.Command{
		Use:   "logs MODEL",
		Short: "View logs for a model",
		Long: `View logs from the inference containers for a model.

Examples:
  # View last 100 lines
  dmrlet logs llama3.2

  # Follow logs in real-time
  dmrlet logs llama3.2 -f

  # View last 50 lines
  dmrlet logs llama3.2 --tail 50`,
		Args: cobra.ExactArgs(1),
		RunE: runLogs,
	}

	cmd.Flags().BoolVarP(&logsFollow, "follow", "f", false, "Follow log output")
	cmd.Flags().IntVar(&logsTail, "tail", 100, "Number of lines to show from the end")

	return cmd
}

func runLogs(cmd *cobra.Command, args []string) error {
	model := args[0]

	client := daemon.NewClient(socketPath)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Handle Ctrl+C
	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
	go func() {
		<-sigCh
		cancel()
	}()

	logChan, err := client.StreamLogs(ctx, model, logsTail, logsFollow)
	if err != nil {
		return fmt.Errorf("failed to get logs: %w", err)
	}

	for line := range logChan {
		if line.Timestamp.IsZero() {
			fmt.Print(line.Message)
		} else {
			fmt.Printf("[%s] %s\n", line.Timestamp.Format("2006-01-02 15:04:05"), line.Message)
		}
	}

	return nil
}
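
`StreamLogs` delivers entries over a channel; the consumer loop above implies a log-line type roughly like this (inferred from field usage, since the client package is not part of this excerpt):

```go
// Sketch inferred from the loop in runLogs; not the actual client source.
package daemon

import "time"

// LogLine is one log entry streamed from an inference container.
type LogLine struct {
	Timestamp time.Time // zero value when no timestamp was parsed
	Message   string
}
```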

cmd/dmrlet/commands/ps.go

Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
package commands

import (
	"context"
	"fmt"
	"os"
	"strings"
	"text/tabwriter"

	"github.com/docker/model-runner/pkg/dmrlet/daemon"
	"github.com/spf13/cobra"
)

func newPsCmd() *cobra.Command {
	cmd := &cobra.Command{
		Use:   "ps",
		Short: "List running inference containers",
		Long: `List all running inference containers.

Examples:
  # List all running models
  dmrlet ps`,
		RunE: runPs,
	}

	return cmd
}

func runPs(cmd *cobra.Command, args []string) error {
	client := daemon.NewClient(socketPath)

	resp, err := client.List(context.Background())
	if err != nil {
		return fmt.Errorf("failed to list models: %w", err)
	}

	if len(resp.Models) == 0 {
		fmt.Println("No models running")
		return nil
	}

	w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
	fmt.Fprintln(w, "MODEL\tBACKEND\tREPLICAS\tGPUS\tENDPOINTS\tSTATUS")

	for _, m := range resp.Models {
		gpus := formatGPUList(m.GPUs)
		endpoints := formatEndpoints(m.Endpoints)

		fmt.Fprintf(w, "%s\t%s\t%d\t%s\t%s\t%s\n",
			m.Model,
			m.Backend,
			m.Replicas,
			gpus,
			endpoints,
			m.Status,
		)
	}

	return w.Flush()
}

func formatGPUList(gpus []int) string {
	if len(gpus) == 0 {
		return "-"
	}
	strs := make([]string, len(gpus))
	for i, g := range gpus {
		strs[i] = fmt.Sprintf("%d", g)
	}
	return "[" + strings.Join(strs, ",") + "]"
}

func formatEndpoints(endpoints []string) string {
	if len(endpoints) == 0 {
		return "-"
	}
	if len(endpoints) == 1 {
		return endpoints[0]
	}
	return strings.Join(endpoints, ",")
}
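
For completeness, the `List` response consumed above implies types along these lines (again inferred from field usage, not taken from the client package):

```go
// Sketch inferred from runPs above; not the actual client source.
package daemon

// ModelInfo describes one running model as printed by `dmrlet ps`.
type ModelInfo struct {
	Model     string   // model name, e.g. llama3.2
	Backend   string   // llama.cpp, vllm, or sglang
	Replicas  int      // number of running replicas
	GPUs      []int    // GPU indices allocated to this model
	Endpoints []string // host:port endpoints serving requests
	Status    string   // e.g. healthy
}

// ListResponse is the daemon's reply to a List request.
type ListResponse struct {
	Models []ModelInfo
}
```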
