10 changes: 4 additions & 6 deletions runner/cmd/nexa-cli/infer.go
@@ -423,9 +423,8 @@ func inferLLM(manifest *types.ModelManifest, quant string) error {
 		PluginID: manifest.PluginId,
 		DeviceID: manifest.DeviceId,
 		Config: nexa_sdk.ModelConfig{
-			NCtx:         nctx,
-			NGpuLayers:   ngl,
-			SystemPrompt: systemPrompt, // TODO: align npu
+			NCtx:       nctx,
+			NGpuLayers: ngl,
 		},
 	})
 	spin.Stop()
@@ -595,9 +594,8 @@ func inferVLM(manifest *types.ModelManifest, quant string) error {
 		PluginID: manifest.PluginId,
 		DeviceID: manifest.DeviceId,
 		Config: nexa_sdk.ModelConfig{
-			NCtx:         nctx,
-			NGpuLayers:   ngl,
-			SystemPrompt: systemPrompt,
+			NCtx:       nctx,
+			NGpuLayers: ngl,
 		},
 	})
 	spin.Stop()
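With SystemPrompt removed from nexa_sdk.ModelConfig, the CLI no longer pins a system prompt at model-creation time; the prompt travels with the conversation instead. A minimal sketch of the new calling pattern, using the message and template types that appear in the handler changes further down (systemPrompt, userPrompt, and llm are illustrative variables, not code from this PR):

// Sketch only: the system prompt is now an ordinary "system" chat message.
messages := []nexa_sdk.LlmChatMessage{
	{Role: nexa_sdk.LLMRole("system"), Content: systemPrompt},
	{Role: nexa_sdk.LLMRole("user"), Content: userPrompt},
}
// The chat template renders the system turn like any other message.
formatted, err := llm.ApplyChatTemplate(nexa_sdk.LlmApplyChatTemplateInput{
	Messages: messages,
})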
3 changes: 0 additions & 3 deletions runner/internal/types/model.go
@@ -86,9 +86,6 @@ func (m ModelManifest) GetSize() int64 {
 type ModelParam struct {
 	NCtx       int32
 	NGpuLayers int32
-
-	// npu only
-	SystemPrompt string
 }
 
 type DownloadInfo struct {
1 change: 0 additions & 1 deletion runner/nexa-sdk/common.go
@@ -277,5 +277,4 @@ type ModelConfig struct {
 	NGpuLayers          int32
 	ChatTemplatePath    string
 	ChatTemplateContent string
-	SystemPrompt        string
 }
7 changes: 0 additions & 7 deletions runner/nexa-sdk/llm.go
@@ -82,10 +82,6 @@ func (lci LlmCreateInput) toCPtr() *C.ml_LlmCreateInput {
 	if lci.Config.ChatTemplateContent != "" {
 		cPtr.config.chat_template_content = C.CString(lci.Config.ChatTemplateContent)
 	}
-	// Add system prompt support
-	if lci.Config.SystemPrompt != "" {
-		cPtr.config.system_prompt = C.CString(lci.Config.SystemPrompt)
-	}
 
 	return cPtr
 }
@@ -114,9 +110,6 @@ func freeLlmCreateInput(cPtr *C.ml_LlmCreateInput) {
 	if cPtr.config.chat_template_content != nil {
 		C.free(unsafe.Pointer(cPtr.config.chat_template_content))
 	}
-	if cPtr.config.system_prompt != nil {
-		C.free(unsafe.Pointer(cPtr.config.system_prompt))
-	}
 
 	// Free the main structure
 	C.free(unsafe.Pointer(cPtr))
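The deleted lines follow the standard cgo ownership rule: every C.CString allocates on the C heap, invisible to Go's garbage collector, so it must be paired with a C.free in the matching cleanup function — which is why the removal touches both toCPtr and freeLlmCreateInput (and their VLM counterparts below). A self-contained illustration of the pattern:

package main

/*
#include <stdlib.h>
*/
import "C"

import "unsafe"

func main() {
	// C.CString copies the Go string onto the C heap; Go's GC never sees it.
	s := C.CString("system prompt")
	// Without this free, the allocation leaks — hence the paired nil-check
	// and C.free in the SDK's free* functions.
	defer C.free(unsafe.Pointer(s))
}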
7 changes: 0 additions & 7 deletions runner/nexa-sdk/vlm.go
@@ -76,10 +76,6 @@ func (vci VlmCreateInput) toCPtr() *C.ml_VlmCreateInput {
 	if vci.Config.ChatTemplateContent != "" {
 		cPtr.config.chat_template_content = C.CString(vci.Config.ChatTemplateContent)
 	}
-	// Add system prompt support
-	if vci.Config.SystemPrompt != "" {
-		cPtr.config.system_prompt = C.CString(vci.Config.SystemPrompt)
-	}
 
 	return cPtr
 }
@@ -110,9 +106,6 @@ func freeVlmCreateInput(cPtr *C.ml_VlmCreateInput) {
 	if cPtr.config.chat_template_content != nil {
 		C.free(unsafe.Pointer(cPtr.config.chat_template_content))
 	}
-	if cPtr.config.system_prompt != nil {
-		C.free(unsafe.Pointer(cPtr.config.system_prompt))
-	}
 
 	// Free the main structure
 	C.free(unsafe.Pointer(cPtr))
4 changes: 2 additions & 2 deletions runner/server/docs/swagger.yaml
@@ -881,7 +881,7 @@ components:
       stream_format:
         type: string
         enum: [wav, sse]
-        description: "sse" returns Server-Sent Events stream; otherwise returns binary WAV
+        description: '"sse" returns Server-Sent Events stream; otherwise returns binary WAV'
       speed:
         type: number
         format: float
@@ -905,7 +905,7 @@
         description: The audio file to transcribe. Omit for warm up (returns null).
       stream:
         type: string
-        description: "true" is not supported and returns 400
+        description: '"true" is not supported and returns 400'
       language:
        type: string
         description: The language of the input audio
65 changes: 12 additions & 53 deletions runner/server/handler/chat.go
@@ -80,6 +80,14 @@ func defaultChatCompletionRequest() ChatCompletionRequest {
 	}
 }
 
+func onlySystemMessage(param ChatCompletionRequest) bool {
+	if len(param.Messages) != 1 {
+		return false
+	}
+	r := param.Messages[0].GetRole()
+	return r != nil && *r == "system"
+}
+
 func ChatCompletions(c *gin.Context) {
 	param := defaultChatCompletionRequest()
 	if err := c.ShouldBindJSON(&param); err != nil {
@@ -117,11 +125,8 @@ func ChatCompletions(c *gin.Context) {
 }
 
 func chatCompletionsLLM(c *gin.Context, param ChatCompletionRequest) {
-	// Build message list for LLM template
-	var systemPrompt string
 	messages := make([]nexa_sdk.LlmChatMessage, 0, len(param.Messages))
 	for _, msg := range param.Messages {
-		// tool call message
 		if toolCalls := msg.GetToolCalls(); len(toolCalls) > 0 {
 			for _, tc := range toolCalls {
 				messages = append(messages, nexa_sdk.LlmChatMessage{
@@ -133,7 +138,6 @@ func chatCompletionsLLM(c *gin.Context, param ChatCompletionRequest) {
 			continue
 		}
 
-		// tool call response message
 		if toolResp := msg.GetToolCallID(); toolResp != nil {
 			messages = append(messages, nexa_sdk.LlmChatMessage{
 				Role: nexa_sdk.LLMRole(*msg.GetRole()),
@@ -144,21 +148,13 @@ func chatCompletionsLLM(c *gin.Context, param ChatCompletionRequest) {
 
 		switch content := msg.GetContent().AsAny().(type) {
 		case *string:
-			// NOTE: patch for npu
-			if *msg.GetRole() == "system" {
-				systemPrompt += *content
-			}
 			messages = append(messages, nexa_sdk.LlmChatMessage{
 				Role:    nexa_sdk.LLMRole(*msg.GetRole()),
 				Content: *content,
 			})
 
 		case *[]openai.ChatCompletionContentPartTextParam:
 			for _, ct := range *content {
-				// NOTE: patch for npu
-				if *msg.GetRole() == "system" {
-					systemPrompt += ct.Text
-				}
 				messages = append(messages, nexa_sdk.LlmChatMessage{
 					Role:    nexa_sdk.LLMRole(*msg.GetRole()),
 					Content: ct.Text,
@@ -168,10 +164,6 @@ func chatCompletionsLLM(c *gin.Context, param ChatCompletionRequest) {
 			for _, ct := range *content {
 				switch *ct.GetType() {
 				case "text":
-					// NOTE: patch for npu
-					if *msg.GetRole() == "system" {
-						systemPrompt += *ct.GetText()
-					}
 					messages = append(messages, nexa_sdk.LlmChatMessage{
 						Role:    nexa_sdk.LLMRole(*msg.GetRole()),
 						Content: *ct.GetText(),
@@ -186,10 +178,6 @@ func chatCompletionsLLM(c *gin.Context, param ChatCompletionRequest) {
 			for _, ct := range *content {
 				switch *ct.GetType() {
 				case "text":
-					// NOTE: patch for npu
-					if *msg.GetRole() == "system" {
-						systemPrompt += *ct.GetText()
-					}
 					messages = append(messages, nexa_sdk.LlmChatMessage{
 						Role:    nexa_sdk.LLMRole(*msg.GetRole()),
 						Content: *ct.GetText(),
@@ -218,10 +206,9 @@ func chatCompletionsLLM(c *gin.Context, param ChatCompletionRequest) {
 
 	samplerConfig := parseSamplerConfig(param)
 
-	// Get LLM instance
 	p, err := service.KeepAliveGet[nexa_sdk.LLM](
 		string(param.Model),
-		types.ModelParam{NCtx: param.NCtx, NGpuLayers: param.Ngl, SystemPrompt: systemPrompt},
+		types.ModelParam{NCtx: param.NCtx, NGpuLayers: param.Ngl},
 		c.GetHeader("Nexa-KeepCache") != "true",
 	)
 	if errors.Is(err, os.ErrNotExist) {
@@ -231,13 +218,11 @@ func chatCompletionsLLM(c *gin.Context, param ChatCompletionRequest) {
 		c.JSON(http.StatusInternalServerError, map[string]any{"error": err.Error(), "code": nexa_sdk.SDKErrorCode(err)})
 		return
 	}
-	// Empty request for warm up
-	if len(param.Messages) == 0 || (systemPrompt != "" && len(param.Messages) <= 1) {
+	if len(param.Messages) == 0 || onlySystemMessage(param) {
 		c.JSON(http.StatusOK, nil)
 		return
 	}
 
-	// Format prompt using chat template
 	formatted, err := p.ApplyChatTemplate(nexa_sdk.LlmApplyChatTemplateInput{
 		Messages: messages,
 		Tools:    tools,
@@ -421,11 +406,8 @@ func chatCompletionsLLM(c *gin.Context, param ChatCompletionRequest) {
 }
 
 func chatCompletionsVLM(c *gin.Context, param ChatCompletionRequest) {
-	// Build message list for VLM template
-	var systemPrompt string
 	messages := make([]nexa_sdk.VlmChatMessage, 0, len(param.Messages))
 	for _, msg := range param.Messages {
-		// tool call message
 		if toolCalls := msg.GetToolCalls(); len(toolCalls) > 0 {
 			contents := make([]nexa_sdk.VlmContent, 0, len(toolCalls))
 			for _, tc := range toolCalls {
@@ -442,7 +424,6 @@ func chatCompletionsVLM(c *gin.Context, param ChatCompletionRequest) {
 			continue
 		}
 
-		// tool call response message
 		if toolResp := msg.GetToolCallID(); toolResp != nil {
 			messages = append(messages, nexa_sdk.VlmChatMessage{
 				Role: nexa_sdk.VlmRole(*msg.GetRole()),
@@ -456,9 +437,6 @@ func chatCompletionsVLM(c *gin.Context, param ChatCompletionRequest) {
 
 		switch content := msg.GetContent().AsAny().(type) {
 		case *string:
-			if *msg.GetRole() == "system" {
-				systemPrompt += *content
-			}
 			messages = append(messages, nexa_sdk.VlmChatMessage{
 				Role: nexa_sdk.VlmRole(*msg.GetRole()),
 				Contents: []nexa_sdk.VlmContent{
@@ -468,32 +446,22 @@ func chatCompletionsVLM(c *gin.Context, param ChatCompletionRequest) {
 
 		case *[]openai.ChatCompletionContentPartTextParam:
 			contents := make([]nexa_sdk.VlmContent, 0, len(*content))
-
 			for _, ct := range *content {
-				if *msg.GetRole() == "system" {
-					systemPrompt += ct.Text
-				}
 				contents = append(contents, nexa_sdk.VlmContent{
 					Type: nexa_sdk.VlmContentTypeText,
 					Text: ct.Text,
 				})
 			}
-
 			messages = append(messages, nexa_sdk.VlmChatMessage{
 				Role:     nexa_sdk.VlmRole(*msg.GetRole()),
 				Contents: contents,
 			})
 
 		case *[]openai.ChatCompletionContentPartUnionParam:
 			contents := make([]nexa_sdk.VlmContent, 0, len(*content))
-
 			for _, ct := range *content {
 				switch *ct.GetType() {
 				case "text":
-					// NOTE: patch for npu
-					if *msg.GetRole() == "system" {
-						systemPrompt += *ct.GetText()
-					}
 					contents = append(contents, nexa_sdk.VlmContent{
 						Type: nexa_sdk.VlmContentTypeText,
 						Text: *ct.GetText(),
@@ -528,22 +496,16 @@ func chatCompletionsVLM(c *gin.Context, param ChatCompletionRequest) {
 					return
 				}
 			}
-
 			messages = append(messages, nexa_sdk.VlmChatMessage{
 				Role:     nexa_sdk.VlmRole(*msg.GetRole()),
 				Contents: contents,
 			})
 
 		case *[]openai.ChatCompletionAssistantMessageParamContentArrayOfContentPartUnion:
 			contents := make([]nexa_sdk.VlmContent, 0, len(*content))
-
 			for _, ct := range *content {
 				switch *ct.GetType() {
 				case "text":
-					// NOTE: patch for npu
-					if *msg.GetRole() == "system" {
-						systemPrompt += *ct.GetText()
-					}
 					contents = append(contents, nexa_sdk.VlmContent{
 						Type: nexa_sdk.VlmContentTypeText,
 						Text: *ct.GetText(),
@@ -577,10 +539,9 @@ func chatCompletionsVLM(c *gin.Context, param ChatCompletionRequest) {
 
 	samplerConfig := parseSamplerConfig(param)
 
-	// Get VLM instance
 	p, err := service.KeepAliveGet[nexa_sdk.VLM](
 		string(param.Model),
-		types.ModelParam{NCtx: param.NCtx, NGpuLayers: param.Ngl, SystemPrompt: systemPrompt},
+		types.ModelParam{NCtx: param.NCtx, NGpuLayers: param.Ngl},
 		c.GetHeader("Nexa-KeepCache") != "true",
 	)
 	if errors.Is(err, os.ErrNotExist) {
@@ -590,9 +551,7 @@ func chatCompletionsVLM(c *gin.Context, param ChatCompletionRequest) {
 		c.JSON(http.StatusInternalServerError, map[string]any{"error": err.Error(), "code": nexa_sdk.SDKErrorCode(err)})
 		return
 	}
-
-	// Empty request for warm up, just reset model state
-	if len(param.Messages) == 0 || (systemPrompt != "" && len(param.Messages) <= 1) {
+	if len(param.Messages) == 0 || onlySystemMessage(param) {
 		c.JSON(http.StatusOK, nil)
 		return
 	}
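The new onlySystemMessage helper replaces the accumulated systemPrompt string in the warm-up check: a request now counts as a warm-up when it has no messages at all, or exactly one message whose role is "system"; either way the handler loads the model and returns 200 with a null body. A self-contained sketch of the predicate's behavior (simplified types — the real ones come from the server's request structs):

package main

import "fmt"

type message struct{ Role string }

// Mirrors the helper above with plain types.
func onlySystemMessage(msgs []message) bool {
	return len(msgs) == 1 && msgs[0].Role == "system"
}

func main() {
	fmt.Println(onlySystemMessage([]message{{Role: "system"}}))                 // true: warm-up
	fmt.Println(onlySystemMessage([]message{{Role: "system"}, {Role: "user"}})) // false: real request
	fmt.Println(onlySystemMessage(nil))                                         // false: covered by the len == 0 branch
}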
10 changes: 4 additions & 6 deletions runner/server/service/keepalive.go
@@ -169,9 +169,8 @@ func keepAliveGet[T any](name string, param types.ModelParam, reset bool) (any,
 			ModelName: manifest.ModelName,
 			ModelPath: modelfile,
 			Config: nexa_sdk.ModelConfig{
-				NCtx:         param.NCtx,
-				NGpuLayers:   param.NGpuLayers,
-				SystemPrompt: param.SystemPrompt,
+				NCtx:       param.NCtx,
+				NGpuLayers: param.NGpuLayers,
 			},
 			PluginID: manifest.PluginId,
 			DeviceID: manifest.DeviceId,
@@ -191,9 +190,8 @@ func keepAliveGet[T any](name string, param types.ModelParam, reset bool) (any,
 			MmprojPath:    mmproj,
 			TokenizerPath: tokenizer,
 			Config: nexa_sdk.ModelConfig{
-				NCtx:         param.NCtx,
-				NGpuLayers:   param.NGpuLayers,
-				SystemPrompt: param.SystemPrompt,
+				NCtx:       param.NCtx,
+				NGpuLayers: param.NGpuLayers,
 			},
 			PluginID: manifest.PluginId,
 			DeviceID: manifest.DeviceId,
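Taken together, a client can still pre-load a model by sending an empty or system-only message list; nothing about the warm-up contract changes except that the system prompt is no longer cached on the model instance. A client-side sketch — the port, route, and model name are assumptions for illustration, not taken from this PR:

package main

import (
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	// Hypothetical endpoint and model name; adjust to your deployment.
	body := []byte(`{"model":"some-model","messages":[{"role":"system","content":"You are helpful."}]}`)
	resp, err := http.Post("http://localhost:8080/v1/chat/completions", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status) // expect "200 OK" with a null body: the model is now warm
}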