
Commit b98e16c

Merge pull request #1036 from NexaAI/refactor/mengsheng/npu-systemp-prompt
refactor: remove deprecated system prompt patch for npu
2 parents 9ee6a09 + de95faf commit b98e16c

8 files changed: +25 −85 lines changed


runner/cmd/nexa-cli/infer.go

Lines changed: 4 additions & 6 deletions
@@ -423,9 +423,8 @@ func inferLLM(manifest *types.ModelManifest, quant string) error {
 		PluginID: manifest.PluginId,
 		DeviceID: manifest.DeviceId,
 		Config: nexa_sdk.ModelConfig{
-			NCtx:         nctx,
-			NGpuLayers:   ngl,
-			SystemPrompt: systemPrompt, // TODO: align npu
+			NCtx:       nctx,
+			NGpuLayers: ngl,
 		},
 	})
 	spin.Stop()
@@ -595,9 +594,8 @@ func inferVLM(manifest *types.ModelManifest, quant string) error {
 		PluginID: manifest.PluginId,
 		DeviceID: manifest.DeviceId,
 		Config: nexa_sdk.ModelConfig{
-			NCtx:         nctx,
-			NGpuLayers:   ngl,
-			SystemPrompt: systemPrompt,
+			NCtx:       nctx,
+			NGpuLayers: ngl,
 		},
 	})
 	spin.Stop()
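After this commit, both inferLLM and inferVLM pass only the context size and GPU layer count at create time; the system prompt no longer rides along in ModelConfig and instead reaches the model as a regular chat message. A minimal sketch of the resulting config construction, assuming the nctx and ngl variables visible in the hunk (everything else is illustrative, not copied from infer.go):

// Sketch only: create-time config after the removal.
cfg := nexa_sdk.ModelConfig{
	NCtx:       nctx, // requested context window
	NGpuLayers: ngl,  // layers to offload to the GPU
}
// The former SystemPrompt field is gone; system instructions are now sent as a
// normal "system" chat message instead of a create-time option.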

runner/internal/types/model.go

Lines changed: 0 additions & 3 deletions
@@ -86,9 +86,6 @@ func (m ModelManifest) GetSize() int64 {
 type ModelParam struct {
 	NCtx       int32
 	NGpuLayers int32
-
-	// npu only
-	SystemPrompt string
 }
 
 type DownloadInfo struct {

runner/nexa-sdk/common.go

Lines changed: 0 additions & 1 deletion
@@ -277,5 +277,4 @@ type ModelConfig struct {
 	NGpuLayers          int32
 	ChatTemplatePath    string
 	ChatTemplateContent string
-	SystemPrompt        string
 }
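For reference, the tail of ModelConfig in runner/nexa-sdk/common.go now reads as sketched below. The hunk only shows the last few fields, so anything declared earlier in the struct is omitted here; NCtx is included because call sites in this same commit set Config.NCtx, but its exact position in the struct is an assumption.

// Sketch of the trimmed struct; fields not visible in this diff are omitted.
type ModelConfig struct {
	NCtx                int32
	NGpuLayers          int32
	ChatTemplatePath    string
	ChatTemplateContent string
}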

runner/nexa-sdk/llm.go

Lines changed: 0 additions & 7 deletions
@@ -82,10 +82,6 @@ func (lci LlmCreateInput) toCPtr() *C.ml_LlmCreateInput {
 	if lci.Config.ChatTemplateContent != "" {
 		cPtr.config.chat_template_content = C.CString(lci.Config.ChatTemplateContent)
 	}
-	// Add system prompt support
-	if lci.Config.SystemPrompt != "" {
-		cPtr.config.system_prompt = C.CString(lci.Config.SystemPrompt)
-	}
 
 	return cPtr
 }
@@ -114,9 +110,6 @@ func freeLlmCreateInput(cPtr *C.ml_LlmCreateInput) {
 	if cPtr.config.chat_template_content != nil {
 		C.free(unsafe.Pointer(cPtr.config.chat_template_content))
 	}
-	if cPtr.config.system_prompt != nil {
-		C.free(unsafe.Pointer(cPtr.config.system_prompt))
-	}
 
 	// Free the main structure
 	C.free(unsafe.Pointer(cPtr))
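The deleted lines followed the usual cgo ownership rule: every string copied into C memory with C.CString must be released exactly once with C.free, which is why system_prompt appeared in both toCPtr and freeLlmCreateInput. A self-contained sketch of that pairing (the helper below is illustrative and not part of this repository):

package main

// #include <stdlib.h>
import "C"

import "unsafe"

// withCString copies s into C-allocated memory, hands it to use, and frees it
// afterwards. C.CString allocations are invisible to the Go garbage collector,
// so skipping the matching C.free would leak memory.
func withCString(s string, use func(cs *C.char)) {
	cs := C.CString(s)
	defer C.free(unsafe.Pointer(cs))
	use(cs)
}

func main() {
	withCString("hello from Go", func(cs *C.char) {
		_ = cs // hand cs to a C API here
	})
}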

runner/nexa-sdk/vlm.go

Lines changed: 0 additions & 7 deletions
@@ -76,10 +76,6 @@ func (vci VlmCreateInput) toCPtr() *C.ml_VlmCreateInput {
 	if vci.Config.ChatTemplateContent != "" {
 		cPtr.config.chat_template_content = C.CString(vci.Config.ChatTemplateContent)
 	}
-	// Add system prompt support
-	if vci.Config.SystemPrompt != "" {
-		cPtr.config.system_prompt = C.CString(vci.Config.SystemPrompt)
-	}
 
 	return cPtr
 }
@@ -110,9 +106,6 @@ func freeVlmCreateInput(cPtr *C.ml_VlmCreateInput) {
 	if cPtr.config.chat_template_content != nil {
 		C.free(unsafe.Pointer(cPtr.config.chat_template_content))
 	}
-	if cPtr.config.system_prompt != nil {
-		C.free(unsafe.Pointer(cPtr.config.system_prompt))
-	}
 
 	// Free the main structure
 	C.free(unsafe.Pointer(cPtr))

runner/server/docs/swagger.yaml

Lines changed: 2 additions & 2 deletions
@@ -881,7 +881,7 @@ components:
       stream_format:
         type: string
         enum: [wav, sse]
-        description: "sse" returns Server-Sent Events stream; otherwise returns binary WAV
+        description: '"sse" returns Server-Sent Events stream; otherwise returns binary WAV'
       speed:
         type: number
         format: float
@@ -905,7 +905,7 @@ components:
         description: The audio file to transcribe. Omit for warm up (returns null).
       stream:
         type: string
-        description: "true" is not supported and returns 400
+        description: '"true" is not supported and returns 400'
       language:
         type: string
         description: The language of the input audio

runner/server/handler/chat.go

Lines changed: 15 additions & 53 deletions
@@ -80,6 +80,17 @@ func defaultChatCompletionRequest() ChatCompletionRequest {
 	}
 }
 
+func isWarmupRequest(param ChatCompletionRequest) bool {
+	if len(param.Messages) == 0 {
+		return true
+	}
+	if len(param.Messages) != 1 {
+		return false
+	}
+	r := param.Messages[0].GetRole()
+	return r != nil && *r == "system"
+}
+
 func ChatCompletions(c *gin.Context) {
 	param := defaultChatCompletionRequest()
 	if err := c.ShouldBindJSON(&param); err != nil {
@@ -117,11 +128,8 @@ func ChatCompletions(c *gin.Context) {
 }
 
 func chatCompletionsLLM(c *gin.Context, param ChatCompletionRequest) {
-	// Build message list for LLM template
-	var systemPrompt string
 	messages := make([]nexa_sdk.LlmChatMessage, 0, len(param.Messages))
 	for _, msg := range param.Messages {
-		// tool call message
 		if toolCalls := msg.GetToolCalls(); len(toolCalls) > 0 {
 			for _, tc := range toolCalls {
 				messages = append(messages, nexa_sdk.LlmChatMessage{
@@ -133,7 +141,6 @@ func chatCompletionsLLM(c *gin.Context, param ChatCompletionRequest) {
 			continue
 		}
 
-		// tool call response message
 		if toolResp := msg.GetToolCallID(); toolResp != nil {
 			messages = append(messages, nexa_sdk.LlmChatMessage{
 				Role: nexa_sdk.LLMRole(*msg.GetRole()),
@@ -144,21 +151,13 @@ func chatCompletionsLLM(c *gin.Context, param ChatCompletionRequest) {
 
 		switch content := msg.GetContent().AsAny().(type) {
 		case *string:
-			// NOTE: patch for npu
-			if *msg.GetRole() == "system" {
-				systemPrompt += *content
-			}
 			messages = append(messages, nexa_sdk.LlmChatMessage{
 				Role:    nexa_sdk.LLMRole(*msg.GetRole()),
 				Content: *content,
 			})
 
 		case *[]openai.ChatCompletionContentPartTextParam:
 			for _, ct := range *content {
-				// NOTE: patch for npu
-				if *msg.GetRole() == "system" {
-					systemPrompt += ct.Text
-				}
 				messages = append(messages, nexa_sdk.LlmChatMessage{
 					Role:    nexa_sdk.LLMRole(*msg.GetRole()),
 					Content: ct.Text,
@@ -168,10 +167,6 @@ func chatCompletionsLLM(c *gin.Context, param ChatCompletionRequest) {
 			for _, ct := range *content {
 				switch *ct.GetType() {
 				case "text":
-					// NOTE: patch for npu
-					if *msg.GetRole() == "system" {
-						systemPrompt += *ct.GetText()
-					}
 					messages = append(messages, nexa_sdk.LlmChatMessage{
 						Role:    nexa_sdk.LLMRole(*msg.GetRole()),
 						Content: *ct.GetText(),
@@ -186,10 +181,6 @@ func chatCompletionsLLM(c *gin.Context, param ChatCompletionRequest) {
 			for _, ct := range *content {
 				switch *ct.GetType() {
 				case "text":
-					// NOTE: patch for npu
-					if *msg.GetRole() == "system" {
-						systemPrompt += *ct.GetText()
-					}
 					messages = append(messages, nexa_sdk.LlmChatMessage{
 						Role:    nexa_sdk.LLMRole(*msg.GetRole()),
 						Content: *ct.GetText(),
@@ -218,10 +209,9 @@ func chatCompletionsLLM(c *gin.Context, param ChatCompletionRequest) {
 
 	samplerConfig := parseSamplerConfig(param)
 
-	// Get LLM instance
 	p, err := service.KeepAliveGet[nexa_sdk.LLM](
 		string(param.Model),
-		types.ModelParam{NCtx: param.NCtx, NGpuLayers: param.Ngl, SystemPrompt: systemPrompt},
+		types.ModelParam{NCtx: param.NCtx, NGpuLayers: param.Ngl},
 		c.GetHeader("Nexa-KeepCache") != "true",
 	)
 	if errors.Is(err, os.ErrNotExist) {
@@ -231,13 +221,11 @@ func chatCompletionsLLM(c *gin.Context, param ChatCompletionRequest) {
 		c.JSON(http.StatusInternalServerError, map[string]any{"error": err.Error(), "code": nexa_sdk.SDKErrorCode(err)})
 		return
 	}
-	// Empty request for warm up
-	if len(param.Messages) == 0 || (systemPrompt != "" && len(param.Messages) <= 1) {
+	if isWarmupRequest(param) {
 		c.JSON(http.StatusOK, nil)
 		return
 	}
 
-	// Format prompt using chat template
 	formatted, err := p.ApplyChatTemplate(nexa_sdk.LlmApplyChatTemplateInput{
 		Messages: messages,
 		Tools:    tools,
@@ -421,11 +409,8 @@ func chatCompletionsLLM(c *gin.Context, param ChatCompletionRequest) {
 }
 
 func chatCompletionsVLM(c *gin.Context, param ChatCompletionRequest) {
-	// Build message list for VLM template
-	var systemPrompt string
 	messages := make([]nexa_sdk.VlmChatMessage, 0, len(param.Messages))
 	for _, msg := range param.Messages {
-		// tool call message
 		if toolCalls := msg.GetToolCalls(); len(toolCalls) > 0 {
 			contents := make([]nexa_sdk.VlmContent, 0, len(toolCalls))
 			for _, tc := range toolCalls {
@@ -442,7 +427,6 @@ func chatCompletionsVLM(c *gin.Context, param ChatCompletionRequest) {
 			continue
 		}
 
-		// tool call response message
 		if toolResp := msg.GetToolCallID(); toolResp != nil {
 			messages = append(messages, nexa_sdk.VlmChatMessage{
 				Role: nexa_sdk.VlmRole(*msg.GetRole()),
@@ -456,9 +440,6 @@ func chatCompletionsVLM(c *gin.Context, param ChatCompletionRequest) {
 
 		switch content := msg.GetContent().AsAny().(type) {
 		case *string:
-			if *msg.GetRole() == "system" {
-				systemPrompt += *content
-			}
 			messages = append(messages, nexa_sdk.VlmChatMessage{
 				Role: nexa_sdk.VlmRole(*msg.GetRole()),
 				Contents: []nexa_sdk.VlmContent{
@@ -468,32 +449,22 @@ func chatCompletionsVLM(c *gin.Context, param ChatCompletionRequest) {
 
 		case *[]openai.ChatCompletionContentPartTextParam:
 			contents := make([]nexa_sdk.VlmContent, 0, len(*content))
-
 			for _, ct := range *content {
-				if *msg.GetRole() == "system" {
-					systemPrompt += ct.Text
-				}
 				contents = append(contents, nexa_sdk.VlmContent{
 					Type: nexa_sdk.VlmContentTypeText,
 					Text: ct.Text,
 				})
 			}
-
 			messages = append(messages, nexa_sdk.VlmChatMessage{
 				Role:     nexa_sdk.VlmRole(*msg.GetRole()),
 				Contents: contents,
 			})
 
 		case *[]openai.ChatCompletionContentPartUnionParam:
 			contents := make([]nexa_sdk.VlmContent, 0, len(*content))
-
 			for _, ct := range *content {
 				switch *ct.GetType() {
 				case "text":
-					// NOTE: patch for npu
-					if *msg.GetRole() == "system" {
-						systemPrompt += *ct.GetText()
-					}
 					contents = append(contents, nexa_sdk.VlmContent{
 						Type: nexa_sdk.VlmContentTypeText,
 						Text: *ct.GetText(),
@@ -528,22 +499,16 @@ func chatCompletionsVLM(c *gin.Context, param ChatCompletionRequest) {
 					return
 				}
 			}
-
 			messages = append(messages, nexa_sdk.VlmChatMessage{
 				Role:     nexa_sdk.VlmRole(*msg.GetRole()),
 				Contents: contents,
 			})
 
 		case *[]openai.ChatCompletionAssistantMessageParamContentArrayOfContentPartUnion:
 			contents := make([]nexa_sdk.VlmContent, 0, len(*content))
-
 			for _, ct := range *content {
 				switch *ct.GetType() {
 				case "text":
-					// NOTE: patch for npu
-					if *msg.GetRole() == "system" {
-						systemPrompt += *ct.GetText()
-					}
 					contents = append(contents, nexa_sdk.VlmContent{
 						Type: nexa_sdk.VlmContentTypeText,
 						Text: *ct.GetText(),
@@ -577,10 +542,9 @@ func chatCompletionsVLM(c *gin.Context, param ChatCompletionRequest) {
 
 	samplerConfig := parseSamplerConfig(param)
 
-	// Get VLM instance
 	p, err := service.KeepAliveGet[nexa_sdk.VLM](
 		string(param.Model),
-		types.ModelParam{NCtx: param.NCtx, NGpuLayers: param.Ngl, SystemPrompt: systemPrompt},
+		types.ModelParam{NCtx: param.NCtx, NGpuLayers: param.Ngl},
 		c.GetHeader("Nexa-KeepCache") != "true",
 	)
 	if errors.Is(err, os.ErrNotExist) {
@@ -590,9 +554,7 @@ func chatCompletionsVLM(c *gin.Context, param ChatCompletionRequest) {
 		c.JSON(http.StatusInternalServerError, map[string]any{"error": err.Error(), "code": nexa_sdk.SDKErrorCode(err)})
 		return
 	}
-
-	// Empty request for warm up, just reset model state
-	if len(param.Messages) == 0 || (systemPrompt != "" && len(param.Messages) <= 1) {
+	if isWarmupRequest(param) {
 		c.JSON(http.StatusOK, nil)
 		return
 	}
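The new isWarmupRequest helper centralizes what both handlers previously derived from the accumulated systemPrompt string: a request is a warm-up if it carries no messages, or exactly one message whose role is "system". A hedged, self-contained restatement of that rule with a simplified message type (the real handler works on the OpenAI union types shown in the diff above):

package main

import "fmt"

// msg is a stand-in for the OpenAI message union used by the real handler.
type msg struct{ role string }

// isWarmup restates the rule from isWarmupRequest: an empty request, or a
// single system-only message, means "load the model but generate nothing".
func isWarmup(messages []msg) bool {
	if len(messages) == 0 {
		return true
	}
	if len(messages) != 1 {
		return false
	}
	return messages[0].role == "system"
}

func main() {
	fmt.Println(isWarmup(nil))                                     // true: empty body
	fmt.Println(isWarmup([]msg{{role: "system"}}))                 // true: lone system message
	fmt.Println(isWarmup([]msg{{role: "system"}, {role: "user"}})) // false: real conversation
}

Both chatCompletionsLLM and chatCompletionsVLM answer such requests with 200 and a null body, matching the old systemPrompt-based check they replace.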

runner/server/service/keepalive.go

Lines changed: 4 additions & 6 deletions
@@ -169,9 +169,8 @@ func keepAliveGet[T any](name string, param types.ModelParam, reset bool) (any,
 		ModelName: manifest.ModelName,
 		ModelPath: modelfile,
 		Config: nexa_sdk.ModelConfig{
-			NCtx:         param.NCtx,
-			NGpuLayers:   param.NGpuLayers,
-			SystemPrompt: param.SystemPrompt,
+			NCtx:       param.NCtx,
+			NGpuLayers: param.NGpuLayers,
 		},
 		PluginID: manifest.PluginId,
 		DeviceID: manifest.DeviceId,
@@ -191,9 +190,8 @@
 		MmprojPath:    mmproj,
 		TokenizerPath: tokenizer,
 		Config: nexa_sdk.ModelConfig{
-			NCtx:         param.NCtx,
-			NGpuLayers:   param.NGpuLayers,
-			SystemPrompt: param.SystemPrompt,
+			NCtx:       param.NCtx,
+			NGpuLayers: param.NGpuLayers,
 		},
 		PluginID: manifest.PluginId,
 		DeviceID: manifest.DeviceId,
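Both branches of keepAliveGet now build nexa_sdk.ModelConfig from types.ModelParam in the same way. A hypothetical helper restating that mapping (it does not exist in the repo; local stand-in types are used so the sketch compiles on its own):

package main

import "fmt"

// Stand-ins for types.ModelParam and nexa_sdk.ModelConfig from this repo.
type modelParam struct {
	NCtx       int32
	NGpuLayers int32
}

type modelConfig struct {
	NCtx       int32
	NGpuLayers int32
}

// configFromParam restates the mapping both keepAliveGet branches perform after
// this commit: only context size and GPU layer count are forwarded.
func configFromParam(p modelParam) modelConfig {
	return modelConfig{
		NCtx:       p.NCtx,
		NGpuLayers: p.NGpuLayers,
	}
}

func main() {
	fmt.Printf("%+v\n", configFromParam(modelParam{NCtx: 4096, NGpuLayers: 99}))
}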
