package commands

import (
	"context"
	"fmt"
	"os"
	"os/signal"
	"syscall"

	"github.com/docker/model-runner/pkg/dmrlet/inference"
	"github.com/spf13/cobra"
)

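// serveFlags holds the command-line flags accepted by `dmrlet serve`.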
type serveFlags struct {
	port    int
	backend string
	gpu     bool
	detach  bool
}

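// newServeCmd builds the `serve` subcommand, binding its flags to a
// serveFlags value that runServe reads when the command executes.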
func newServeCmd() *cobra.Command {
	flags := &serveFlags{}

	cmd := &cobra.Command{
		Use:   "serve MODEL",
		Short: "Serve a model (pull if needed, start container, wait for ready)",
		Long: `Serve a model by pulling it if needed, starting an inference container,
and waiting for it to be ready. The model will be exposed via an OpenAI-compatible API.

Examples:
  dmrlet serve ai/smollm2
  dmrlet serve ai/smollm2 --port 8080
  dmrlet serve ai/smollm2 --gpu
  dmrlet serve ai/smollm2 --backend vllm --gpu
  dmrlet serve ai/smollm2 -d  # detached mode`,
		Args: cobra.ExactArgs(1),
		RunE: func(cmd *cobra.Command, args []string) error {
			return runServe(cmd, args[0], flags)
		},
	}

	cmd.Flags().IntVarP(&flags.port, "port", "p", 0, "Port to expose the API on (auto-allocated if not specified)")
	cmd.Flags().StringVarP(&flags.backend, "backend", "b", "", "Inference backend (llama-server, vllm)")
	cmd.Flags().BoolVar(&flags.gpu, "gpu", false, "Enable GPU support")
	cmd.Flags().BoolVarP(&flags.detach, "detach", "d", false, "Run in detached mode (return immediately)")

	return cmd
}

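// runServe initializes the container manager, asks it to serve the given
// model, prints connection details, and, unless --detach was given, blocks
// until an interrupt signal stops the model.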
func runServe(cmd *cobra.Command, modelRef string, flags *serveFlags) error {
	ctx := cmd.Context()

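	// initManager is defined elsewhere in this package; it is assumed to set
	// up the package-level manager used below.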
	if err := initManager(ctx); err != nil {
		return fmt.Errorf("initializing manager: %w", err)
	}

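	// Progress is presumably the writer that receives pull and startup
	// progress; pointing it at stdout lets the user watch the model download.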
	opts := inference.ServeOptions{
		Port:     flags.port,
		Backend:  flags.backend,
		GPU:      flags.gpu,
		Detach:   flags.detach,
		Progress: os.Stdout,
	}

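	// Serve pulls the model if needed, starts the inference container, and
	// (per the command description) waits until the endpoint is ready.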
	running, err := manager.Serve(ctx, modelRef, opts)
	if err != nil {
		return fmt.Errorf("serving model: %w", err)
	}

	cmd.Printf("\nModel %s is ready!\n", modelRef)
	cmd.Printf("Endpoint: %s\n", running.Endpoint)
	cmd.Printf("Backend: %s\n", running.Backend)
	cmd.Printf("Port: %d\n", running.Port)
	cmd.Println()
	cmd.Println("Example usage:")
	cmd.Printf("  curl %s/chat/completions -H 'Content-Type: application/json' \\\n", running.Endpoint)
	cmd.Printf("    -d '{\"model\":\"%s\",\"messages\":[{\"role\":\"user\",\"content\":\"Hello!\"}]}'\n", modelRef)

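	// In detached mode, return immediately and leave the container serving
	// in the background.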
	if flags.detach {
		return nil
	}

	// Wait for interrupt signal
	cmd.Println()
	cmd.Println("Press Ctrl+C to stop the model...")

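	// signal.Notify requires a buffered channel; a buffer of one ensures the
	// signal is not lost if it arrives before we block on the receive below.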
	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
	<-sigCh

	cmd.Println()
	cmd.Println("Stopping model...")

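	// Stop with a fresh context: the command's own context may already be
	// canceled by the time the interrupt arrives.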
	if err := manager.Stop(context.Background(), modelRef); err != nil {
		return fmt.Errorf("stopping model: %w", err)
	}

	cmd.Println("Model stopped.")
	return nil
}