diff --git a/docs/guides/multimodal-content.md b/docs/guides/multimodal-content.md
index 65100420d..2b8e32bd2 100644
--- a/docs/guides/multimodal-content.md
+++ b/docs/guides/multimodal-content.md
@@ -26,12 +26,13 @@ const textPart: TextPart = {
content: 'What do you see in this image?'
}
-// Image from base64 data
+// Image from base64 data (mimeType is required for data sources)
const imagePart: ImagePart = {
type: 'image',
source: {
type: 'data',
- value: 'base64EncodedImageData...'
+ value: 'base64EncodedImageData...',
+ mimeType: 'image/jpeg' // Required for data sources
},
metadata: {
// Provider-specific metadata
@@ -39,12 +40,13 @@ const imagePart: ImagePart = {
}
}
-// Image from URL
+// Image from URL (mimeType is optional for URL sources)
const imageUrlPart: ImagePart = {
type: 'image',
source: {
type: 'url',
- value: 'https://example.com/image.jpg'
+ value: 'https://example.com/image.jpg',
+ mimeType: 'image/jpeg' // Optional hint for URL sources
}
}
```
@@ -95,7 +97,7 @@ const message = {
{ type: 'text', content: 'Describe this image' },
{
type: 'image',
- source: { type: 'data' , value: imageBase64 },
+ source: { type: 'data' , value: imageBase64, mimeType: 'image/jpeg' },
metadata: { detail: 'high' } // 'auto' | 'low' | 'high'
}
]
@@ -115,15 +117,14 @@ import { anthropicText } from '@tanstack/ai-anthropic'
const adapter = anthropicText()
-// Image with media type
+// Image with mimeType in source
const imageMessage = {
role: 'user',
content: [
{ type: 'text', content: 'What do you see?' },
{
type: 'image',
- source: { type: 'data' , value: imageBase64 },
- metadata: { media_type: 'image/jpeg' }
+ source: { type: 'data' , value: imageBase64, mimeType: 'image/jpeg' }
}
]
}
@@ -135,7 +136,7 @@ const docMessage = {
{ type: 'text', content: 'Summarize this document' },
{
type: 'document',
- source: { type: 'data', value: pdfBase64 }
+ source: { type: 'data', value: pdfBase64, mimeType: 'application/pdf' }
}
]
}
@@ -154,15 +155,14 @@ import { geminiText } from '@tanstack/ai-gemini'
const adapter = geminiText()
-// Image with mimeType
+// Image with mimeType in source
const message = {
role: 'user',
content: [
{ type: 'text', content: 'Analyze this image' },
{
type: 'image',
- source: { type: 'data', value: imageBase64 },
- metadata: { mimeType: 'image/png' }
+ source: { type: 'data', value: imageBase64, mimeType: 'image/png' }
}
]
}
@@ -188,7 +188,7 @@ const message = {
{ type: 'text', content: 'What is in this image?' },
{
type: 'image',
- source: { type: 'data', value: imageBase64 }
+ source: { type: 'data', value: imageBase64, mimeType: 'image/jpeg' }
}
]
}
@@ -202,28 +202,39 @@ Content can be provided as either inline data or a URL:
### Data (Base64)
-Use `type: 'data'` for inline base64-encoded content:
+Use `type: 'data'` for inline base64-encoded content. **The `mimeType` field is required** to ensure providers receive proper content type information:
```typescript
const imagePart = {
type: 'image',
source: {
type: 'data',
- value: 'iVBORw0KGgoAAAANSUhEUgAAAAUA...' // Base64 string
+ value: 'iVBORw0KGgoAAAANSUhEUgAAAAUA...', // Base64 string
+ mimeType: 'image/png' // Required for data sources
+ }
+}
+
+const audioPart = {
+ type: 'audio',
+ source: {
+ type: 'data',
+ value: 'base64AudioData...',
+ mimeType: 'audio/mp3' // Required for data sources
}
}
```
### URL
-Use `type: 'url'` for content hosted at a URL:
+Use `type: 'url'` for content hosted at a URL. The `mimeType` field is **optional** as providers can often infer it from the URL or response headers:
```typescript
const imagePart = {
type: 'image',
source: {
type: 'url',
- value: 'https://example.com/image.jpg'
+ value: 'https://example.com/image.jpg',
+ mimeType: 'image/jpeg' // Optional hint
}
}
```
@@ -315,3 +326,163 @@ const stream = chat({
3. **Check model support**: Not all models support all modalities. Verify the model you're using supports the content types you want to send.
4. **Handle errors gracefully**: When a model doesn't support a particular modality, it may throw an error. Handle these cases in your application.
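+
+As a sketch of that last point (hypothetical handling; the exact error shape depends on the provider adapter):
+
+```typescript
+try {
+  const stream = chat({
+    // ... model, adapter, and multimodal messages as shown above
+  })
+  for await (const chunk of stream) {
+    // process chunks as usual
+  }
+} catch (error) {
+  // e.g. the model rejected an unsupported modality
+  console.error('Generation failed:', error)
+}
+```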
+
+## Client-Side Multimodal Messages
+
+When using the `ChatClient` from `@tanstack/ai-client`, you can send multimodal messages directly from your UI using the `sendMessage` method.
+
+### Basic Usage
+
+The `sendMessage` method accepts either a simple string or a `MultimodalContent` object:
+
+```typescript
+import { ChatClient, fetchServerSentEvents } from '@tanstack/ai-client'
+
+const client = new ChatClient({
+ connection: fetchServerSentEvents('/api/chat'),
+})
+
+// Simple text message
+await client.sendMessage('Hello!')
+
+// Multimodal message with image
+await client.sendMessage({
+ content: [
+ { type: 'text', content: 'What is in this image?' },
+ {
+ type: 'image',
+ source: { type: 'url', value: 'https://example.com/photo.jpg' }
+ }
+ ]
+})
+```
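+
+Inline `data` sources work the same way from the client; just remember that `mimeType` is required for them. A sketch, assuming `pdfBase64` already holds a base64-encoded PDF:
+
+```typescript
+// Multimodal message with an inline document
+await client.sendMessage({
+  content: [
+    { type: 'text', content: 'Summarize this document' },
+    {
+      type: 'document',
+      source: { type: 'data', value: pdfBase64, mimeType: 'application/pdf' }
+    }
+  ]
+})
+```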
+
+### Custom Message ID
+
+You can provide a custom ID for the message:
+
+```typescript
+await client.sendMessage({
+ content: 'Hello!',
+ id: 'custom-message-id-123'
+})
+```
+
+### Per-Message Body Parameters
+
+The second parameter allows you to pass additional body parameters for that specific request. These are shallow-merged with the client's base body configuration, with per-message parameters taking priority:
+
+```typescript
+const client = new ChatClient({
+ connection: fetchServerSentEvents('/api/chat'),
+ body: { model: 'gpt-5' }, // Base body params
+})
+
+// Override model for this specific message
+await client.sendMessage('Analyze this complex problem', {
+ model: 'gpt-5-mini',
+ temperature: 0.2,
+})
+```
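+
+For the override example above, the connection receives the shallow-merged body. Roughly (a sketch; `conversationId` comes from the client's ID):
+
+```typescript
+// Effective request body (per-message keys win over base keys)
+const effectiveBody = {
+  model: 'gpt-5-mini', // per-message override of the base 'gpt-5'
+  temperature: 0.2, // from the per-message body
+  conversationId: 'my-conversation', // added automatically by the client
+}
+```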
+
+### React Example
+
+Here's how to use multimodal messages in a React component:
+
+```tsx
+import { useChat } from '@tanstack/ai-react'
+import { fetchServerSentEvents } from '@tanstack/ai-client'
+import { useState } from 'react'
+
+function ChatWithImages() {
+ const [imageUrl, setImageUrl] = useState('')
+ const { sendMessage, messages } = useChat({
+ connection: fetchServerSentEvents('/api/chat'),
+ })
+
+ const handleSendWithImage = () => {
+ if (imageUrl) {
+ sendMessage({
+ content: [
+ { type: 'text', content: 'What do you see in this image?' },
+ { type: 'image', source: { type: 'url', value: imageUrl } }
+ ]
+ })
+ }
+ }
+
+ return (
+ <div>
+ <input
+ value={imageUrl}
+ onChange={(e) => setImageUrl(e.target.value)}
+ placeholder="Image URL"
+ />
+ <button onClick={handleSendWithImage}>Send with image</button>
+ </div>
+ )
+}
+```
+
+### File Upload Example
+
+Here's how to handle file uploads and send them as multimodal content:
+
+```tsx
+import { useChat } from '@tanstack/ai-react'
+import { fetchServerSentEvents } from '@tanstack/ai-client'
+
+function ChatWithFileUpload() {
+ const { sendMessage } = useChat({
+ connection: fetchServerSentEvents('/api/chat'),
+ })
+
+ const handleFileUpload = async (file: File) => {
+ // Convert file to base64
+ const base64 = await new Promise<string>((resolve) => {
+ const reader = new FileReader()
+ reader.onload = () => {
+ const result = reader.result as string
+ // Remove data URL prefix (e.g., "data:image/png;base64,")
+ resolve(result.split(',')[1])
+ }
+ reader.readAsDataURL(file)
+ })
+
+ // Determine content type based on file type
+ const type = file.type.startsWith('image/')
+ ? 'image'
+ : file.type.startsWith('audio/')
+ ? 'audio'
+ : file.type.startsWith('video/')
+ ? 'video'
+ : 'document'
+
+ await sendMessage({
+ content: [
+ { type: 'text', content: `Please analyze this ${type}` },
+ {
+ type,
+ source: { type: 'data', value: base64, mimeType: file.type }
+ }
+ ]
+ })
+ }
+
+ return (
+ <input
+ type="file"
+ onChange={(e) => {
+ const file = e.target.files?.[0]
+ if (file) handleFileUpload(file)
+ }}
+ />
+ )
+}
+```
+
diff --git a/examples/ts-react-chat/src/routes/index.tsx b/examples/ts-react-chat/src/routes/index.tsx
index c9436c7a9..3463c6610 100644
--- a/examples/ts-react-chat/src/routes/index.tsx
+++ b/examples/ts-react-chat/src/routes/index.tsx
@@ -1,6 +1,6 @@
import { useEffect, useMemo, useRef, useState } from 'react'
import { createFileRoute } from '@tanstack/react-router'
-import { Send, Square } from 'lucide-react'
+import { ImagePlus, Send, Square, X } from 'lucide-react'
import ReactMarkdown from 'react-markdown'
import rehypeRaw from 'rehype-raw'
import rehypeSanitize from 'rehype-sanitize'
@@ -10,6 +10,7 @@ import { fetchServerSentEvents, useChat } from '@tanstack/ai-react'
import { clientTools } from '@tanstack/ai-client'
import { ThinkingPart } from '@tanstack/ai-react-ui'
import type { UIMessage } from '@tanstack/ai-react'
+import type { ContentPart } from '@tanstack/ai'
import type { ModelOption } from '@/lib/model-selection'
import GuitarRecommendation from '@/components/example-GuitarRecommendation'
import {
@@ -20,6 +21,13 @@ import {
} from '@/lib/guitar-tools'
import { DEFAULT_MODEL_OPTION, MODEL_OPTIONS } from '@/lib/model-selection'
+/**
+ * Generate a random message ID
+ */
+function generateMessageId(): string {
+ return `msg-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`
+}
+
const getPersonalGuitarPreferenceToolClient =
getPersonalGuitarPreferenceToolDef.client(() => ({ preference: 'acoustic' }))
@@ -148,6 +156,23 @@ function Messages({
)
}
+ // Render image parts
+ if (part.type === 'image') {
+ const imageUrl =
+ part.source.type === 'url'
+ ? part.source.value
+ : `data:${part.source.mimeType ?? 'image/png'};base64,${part.source.value}`
+ return (
+ <img src={imageUrl} alt="attached content" />
+ )
+ }
+
// Approval UI
if (
part.type === 'tool-call' &&
@@ -226,6 +251,10 @@ function Messages({
function ChatPage() {
const [selectedModel, setSelectedModel] =
useState<ModelOption>(DEFAULT_MODEL_OPTION)
+ const [attachedImages, setAttachedImages] = useState<
+ Array<{ id: string; base64: string; mimeType: string; preview: string }>
+ >([])
+ const fileInputRef = useRef<HTMLInputElement>(null)
const body = useMemo(
() => ({
@@ -243,6 +272,103 @@ function ChatPage() {
})
const [input, setInput] = useState('')
+ /**
+ * Handle file selection for image attachment
+ */
+ const handleFileSelect = async (e: React.ChangeEvent<HTMLInputElement>) => {
+ const files = e.target.files
+ if (!files || files.length === 0) return
+
+ const newImages: Array<{
+ id: string
+ base64: string
+ mimeType: string
+ preview: string
+ }> = []
+
+ for (const file of Array.from(files)) {
+ if (!file.type.startsWith('image/')) continue
+
+ const base64 = await new Promise<string>((resolve) => {
+ const reader = new FileReader()
+ reader.onload = () => {
+ const result = reader.result as string
+ // Remove data URL prefix (e.g., "data:image/png;base64,")
+ resolve(result.split(',')[1])
+ }
+ reader.readAsDataURL(file)
+ })
+
+ const preview = URL.createObjectURL(file)
+ newImages.push({
+ id: generateMessageId(),
+ base64,
+ mimeType: file.type, // Capture the actual mime type
+ preview,
+ })
+ }
+
+ setAttachedImages((prev) => [...prev, ...newImages])
+
+ // Reset the file input
+ if (fileInputRef.current) {
+ fileInputRef.current.value = ''
+ }
+ }
+
+ /**
+ * Remove an attached image
+ */
+ const removeImage = (id: string) => {
+ setAttachedImages((prev) => {
+ const image = prev.find((img) => img.id === id)
+ if (image) {
+ URL.revokeObjectURL(image.preview)
+ }
+ return prev.filter((img) => img.id !== id)
+ })
+ }
+
+ /**
+ * Send message with optional image attachments
+ */
+ const handleSendMessage = () => {
+ if (!input.trim() && attachedImages.length === 0) return
+
+ if (attachedImages.length > 0) {
+ // Build multimodal content array
+ const contentParts: Array<ContentPart> = []
+
+ // Add text if present
+ if (input.trim()) {
+ contentParts.push({ type: 'text', content: input.trim() })
+ }
+
+ // Add images with the mimeType carried on the source
+ for (const img of attachedImages) {
+ contentParts.push({
+ type: 'image',
+ source: { type: 'data', value: img.base64, mimeType: img.mimeType },
+ })
+ }
+
+ // Send with custom message ID
+ sendMessage({
+ content: contentParts,
+ id: generateMessageId(),
+ })
+
+ // Clean up image previews
+ attachedImages.forEach((img) => URL.revokeObjectURL(img.preview))
+ setAttachedImages([])
+ } else {
+ // Simple text message
+ sendMessage(input.trim())
+ }
+
+ setInput('')
+ }
+
return (
{/* Chat */}
@@ -295,41 +421,89 @@ function ChatPage() {
)}
-
diff --git a/packages/typescript/ai-anthropic/src/adapters/text.ts b/packages/typescript/ai-anthropic/src/adapters/text.ts
index 5b1896b2d..a1dca82c0 100644
--- a/packages/typescript/ai-anthropic/src/adapters/text.ts
+++ b/packages/typescript/ai-anthropic/src/adapters/text.ts
@@ -313,17 +313,20 @@ export class AnthropicTextAdapter<
? {
type: 'base64',
data: part.source.value,
- media_type: metadata?.mediaType ?? 'image/jpeg',
+ media_type: part.source.mimeType as
+ | 'image/jpeg'
+ | 'image/png'
+ | 'image/gif'
+ | 'image/webp',
}
: {
type: 'url',
url: part.source.value,
}
- const { mediaType: _mediaType, ...meta } = metadata || {}
return {
type: 'image',
source: imageSource,
- ...meta,
+ ...metadata,
}
}
case 'document': {
@@ -333,7 +336,7 @@ export class AnthropicTextAdapter<
? {
type: 'base64',
data: part.source.value,
- media_type: 'application/pdf',
+ media_type: part.source.mimeType as 'application/pdf',
}
: {
type: 'url',
diff --git a/packages/typescript/ai-anthropic/src/index.ts b/packages/typescript/ai-anthropic/src/index.ts
index 4bca2e4b3..b0ff0750b 100644
--- a/packages/typescript/ai-anthropic/src/index.ts
+++ b/packages/typescript/ai-anthropic/src/index.ts
@@ -34,8 +34,6 @@ export type {
AnthropicDocumentMetadata,
AnthropicAudioMetadata,
AnthropicVideoMetadata,
- AnthropicImageMediaType,
- AnthropicDocumentMediaType,
AnthropicMessageMetadataByModality,
} from './message-types'
diff --git a/packages/typescript/ai-client/src/chat-client.ts b/packages/typescript/ai-client/src/chat-client.ts
index 27604beb5..f6ead390d 100644
--- a/packages/typescript/ai-client/src/chat-client.ts
+++ b/packages/typescript/ai-client/src/chat-client.ts
@@ -4,13 +4,19 @@ import {
normalizeToUIMessage,
} from '@tanstack/ai'
import { DefaultChatClientEventEmitter } from './events'
-import type { AnyClientTool, ModelMessage, StreamChunk } from '@tanstack/ai'
+import type {
+ AnyClientTool,
+ ContentPart,
+ ModelMessage,
+ StreamChunk,
+} from '@tanstack/ai'
import type { ConnectionAdapter } from './connection-adapters'
import type { ChatClientEventEmitter } from './events'
import type {
ChatClientOptions,
ChatClientState,
MessagePart,
+ MultimodalContent,
ToolCallPart,
UIMessage,
} from './types'
@@ -20,6 +26,7 @@ export class ChatClient {
private connection: ConnectionAdapter
private uniqueId: string
private body: Record<string, unknown> = {}
+ private pendingMessageBody: Record<string, unknown> | undefined = undefined
private isLoading = false
private error: Error | undefined = undefined
private status: ChatClientState = 'ready'
@@ -262,20 +269,82 @@ export class ChatClient {
}
/**
- * Send a message and stream the response
+ * Send a message and stream the response.
+ * Supports both simple string content and multimodal content (images, audio, video, documents).
+ *
+ * @param content - The message content. Can be:
+ * - A simple string for text-only messages
+ * - A MultimodalContent object with content array and optional custom ID
+ * @param body - Optional body parameters to merge with the client's base body for this request.
+ * Uses shallow merge with per-message body taking priority.
+ *
+ * @example
+ * ```ts
+ * // Simple text message
+ * await client.sendMessage('Hello!')
+ *
+ * // Text message with custom body params
+ * await client.sendMessage('Hello!', { temperature: 0.7 })
+ *
+ * // Multimodal message with image
+ * await client.sendMessage({
+ * content: [
+ * { type: 'text', content: 'What is in this image?' },
+ * { type: 'image', source: { type: 'url', value: 'https://example.com/photo.jpg' } }
+ * ]
+ * })
+ *
+ * // Multimodal message with custom ID and body params
+ * await client.sendMessage(
+ * {
+ * content: [
+ * { type: 'text', content: 'Describe this audio' },
+ * { type: 'audio', source: { type: 'data', value: 'base64...' } }
+ * ],
+ * id: 'custom-message-id'
+ * },
+ * { model: 'gpt-4-audio' }
+ * )
+ * ```
*/
- async sendMessage(content: string): Promise<void> {
- if (!content.trim() || this.isLoading) {
+ async sendMessage(
+ content: string | MultimodalContent,
+ body?: Record<string, unknown>,
+ ): Promise<void> {
+ const emptyMessage = typeof content === 'string' && !content.trim()
+ if (emptyMessage || this.isLoading) {
return
}
+ // Normalize input to extract content, id, and validate
+ const normalizedContent = this.normalizeMessageInput(content)
+
+ // Store the per-message body for use in streamResponse
+ this.pendingMessageBody = body
// Add user message via processor
- const userMessage = this.processor.addUserMessage(content.trim())
- this.events.messageSent(userMessage.id, content.trim())
+ const userMessage = this.processor.addUserMessage(
+ normalizedContent.content,
+ normalizedContent.id,
+ )
+ this.events.messageSent(userMessage.id, normalizedContent.content)
await this.streamResponse()
}
+ /**
+ * Normalize the message input to extract content and optional id.
+ * Trims string content automatically.
+ */
+ private normalizeMessageInput(input: string | MultimodalContent): {
+ content: string | Array<ContentPart>
+ id?: string
+ } {
+ if (typeof input === 'string') {
+ return { content: input.trim() }
+ }
+ return { content: input.content, id: input.id }
+ }
+
/**
* Append a message and stream the response
*/
@@ -317,16 +386,21 @@ export class ChatClient {
// Call onResponse callback
await this.callbacksRef.current.onResponse()
- // Include conversationId in the body for server-side event correlation
- const bodyWithConversationId = {
+ // Merge body: base body + per-message body (per-message takes priority)
+ // Include conversationId for server-side event correlation
+ const mergedBody = {
...this.body,
+ ...this.pendingMessageBody,
conversationId: this.uniqueId,
}
+ // Clear the pending message body after use
+ this.pendingMessageBody = undefined
+
// Connect and stream
const stream = this.connection.connect(
modelMessages,
- bodyWithConversationId,
+ mergedBody,
this.abortController.signal,
)
@@ -343,6 +417,7 @@ export class ChatClient {
} finally {
this.abortController = null
this.setIsLoading(false)
+ this.pendingMessageBody = undefined // Ensure it's cleared even on error
// Drain any actions that were queued while the stream was in progress
await this.drainPostStreamActions()
diff --git a/packages/typescript/ai-client/src/events.ts b/packages/typescript/ai-client/src/events.ts
index 33532f411..0713625bf 100644
--- a/packages/typescript/ai-client/src/events.ts
+++ b/packages/typescript/ai-client/src/events.ts
@@ -1,4 +1,5 @@
import { aiEventClient } from '@tanstack/ai/event-client'
+import type { ContentPart } from '@tanstack/ai'
import type { UIMessage } from './types'
/**
@@ -137,19 +138,36 @@ export abstract class ChatClientEventEmitter {
}
/**
- * Emit message sent event
+ * Emit message sent event.
+ * Supports both simple string content and multimodal content arrays.
+ *
+ * @param messageId - The ID of the sent message
+ * @param content - The message content (string or array of ContentPart for multimodal)
*/
- messageSent(messageId: string, content: string): void {
+ messageSent(messageId: string, content: string | Array<ContentPart>): void {
+ // For text content, extract it; for multimodal, provide the array
+ const textContent =
+ typeof content === 'string'
+ ? content
+ : content
+ .filter((part) => part.type === 'text')
+ .map((part) => (part as { type: 'text'; content: string }).content)
+ .join('')
+
this.emitEvent('text:message:created', {
messageId,
role: 'user',
- content,
+ content: textContent,
+ // Include full content for multimodal messages
+ ...(Array.isArray(content) && { parts: content }),
})
this.emitEvent('text:message:user', {
messageId,
role: 'user',
- content,
+ content: textContent,
+ // Include full content for multimodal messages
+ ...(Array.isArray(content) && { parts: content }),
})
}
@@ -161,7 +179,6 @@ export abstract class ChatClientEventEmitter {
fromMessageIndex,
})
}
-
/**
* Emit stopped event
*/
diff --git a/packages/typescript/ai-client/src/index.ts b/packages/typescript/ai-client/src/index.ts
index b12c7c2ab..b279605d1 100644
--- a/packages/typescript/ai-client/src/index.ts
+++ b/packages/typescript/ai-client/src/index.ts
@@ -12,6 +12,8 @@ export type {
ChatRequestBody,
InferChatMessages,
ChatClientState,
+ // Multimodal content input type
+ MultimodalContent,
} from './types'
export { clientTools, createChatClientOptions } from './types'
export type {
diff --git a/packages/typescript/ai-client/src/types.ts b/packages/typescript/ai-client/src/types.ts
index 8aff9e4d0..985725481 100644
--- a/packages/typescript/ai-client/src/types.ts
+++ b/packages/typescript/ai-client/src/types.ts
@@ -1,10 +1,15 @@
import type {
AnyClientTool,
+ AudioPart,
ChunkStrategy,
+ ContentPart,
+ DocumentPart,
+ ImagePart,
InferToolInput,
InferToolOutput,
ModelMessage,
StreamChunk,
+ VideoPart,
} from '@tanstack/ai'
import type { ConnectionAdapter } from './connection-adapters'
@@ -31,6 +36,35 @@ export type ToolResultState =
*/
export type ChatClientState = 'ready' | 'submitted' | 'streaming' | 'error'
+/**
+ * Multimodal content input for sending messages with rich media.
+ * Allows sending text, images, audio, video, and documents to the LLM.
+ *
+ * @example
+ * ```ts
+ * // Send an image with a question
+ * client.sendMessage({
+ * content: [
+ * { type: 'text', content: 'What is in this image?' },
+ * { type: 'image', source: { type: 'url', value: 'https://example.com/photo.jpg' } }
+ * ],
+ * id: 'custom-message-id' // optional
+ * })
+ * ```
+ */
+export interface MultimodalContent {
+ /**
+ * The content of the message.
+ * Can be a simple string or an array of content parts for multimodal messages.
+ */
+ content: string | Array<ContentPart>
+ /**
+ * Optional custom ID for the message.
+ * If not provided, a unique ID will be generated.
+ */
+ id?: string
+}
+
/**
* Message parts - building blocks of UIMessage
*/
@@ -121,6 +155,10 @@ export interface ThinkingPart {
export type MessagePart<TTools extends Record<string, AnyClientTool> = any> =
| TextPart
+ | ImagePart
+ | AudioPart
+ | VideoPart
+ | DocumentPart
| ToolCallPart
| ToolResultPart
| ThinkingPart
diff --git a/packages/typescript/ai-client/tests/chat-client.test.ts b/packages/typescript/ai-client/tests/chat-client.test.ts
index d632becbf..279603783 100644
--- a/packages/typescript/ai-client/tests/chat-client.test.ts
+++ b/packages/typescript/ai-client/tests/chat-client.test.ts
@@ -1,12 +1,12 @@
import { describe, expect, it, vi } from 'vitest'
import { ChatClient } from '../src/chat-client'
-import type { UIMessage } from '../src/types'
import {
createMockConnectionAdapter,
createTextChunks,
createThinkingChunks,
createToolCallChunks,
} from './test-utils'
+import type { UIMessage } from '../src/types'
describe('ChatClient', () => {
describe('constructor', () => {
@@ -618,4 +618,287 @@ describe('ChatClient', () => {
expect(thinkingCalls.length).toBeGreaterThan(0)
})
})
+
+ describe('multimodal sendMessage', () => {
+ it('should send a multimodal message with image content', async () => {
+ const chunks = createTextChunks('I see a cat in the image')
+ const adapter = createMockConnectionAdapter({ chunks })
+
+ const client = new ChatClient({ connection: adapter })
+
+ await client.sendMessage({
+ content: [
+ { type: 'text', content: 'What is in this image?' },
+ {
+ type: 'image',
+ source: { type: 'url', value: 'https://example.com/cat.jpg' },
+ },
+ ],
+ })
+
+ const messages = client.getMessages()
+ expect(messages.length).toBeGreaterThan(0)
+ expect(messages[0]?.role).toBe('user')
+ expect(messages[0]?.parts.length).toBe(2)
+ expect(messages[0]?.parts[0]).toEqual({
+ type: 'text',
+ content: 'What is in this image?',
+ })
+ expect(messages[0]?.parts[1]).toEqual({
+ type: 'image',
+ source: { type: 'url', value: 'https://example.com/cat.jpg' },
+ })
+ })
+
+ it('should send a multimodal message with audio content', async () => {
+ const chunks = createTextChunks('The audio says hello')
+ const adapter = createMockConnectionAdapter({ chunks })
+
+ const client = new ChatClient({ connection: adapter })
+
+ await client.sendMessage({
+ content: [
+ { type: 'text', content: 'Transcribe this audio' },
+ {
+ type: 'audio',
+ source: {
+ type: 'data',
+ value: 'base64AudioData',
+ mimeType: 'audio/mp3',
+ },
+ },
+ ],
+ })
+
+ const messages = client.getMessages()
+ expect(messages[0]?.parts[1]).toEqual({
+ type: 'audio',
+ source: {
+ type: 'data',
+ value: 'base64AudioData',
+ mimeType: 'audio/mp3',
+ },
+ })
+ })
+
+ it('should send a multimodal message with video content', async () => {
+ const chunks = createTextChunks('The video shows a sunset')
+ const adapter = createMockConnectionAdapter({ chunks })
+
+ const client = new ChatClient({ connection: adapter })
+
+ await client.sendMessage({
+ content: [
+ { type: 'text', content: 'Describe this video' },
+ {
+ type: 'video',
+ source: { type: 'url', value: 'https://example.com/video.mp4' },
+ },
+ ],
+ })
+
+ const messages = client.getMessages()
+ expect(messages[0]?.parts[1]).toEqual({
+ type: 'video',
+ source: { type: 'url', value: 'https://example.com/video.mp4' },
+ })
+ })
+
+ it('should send a multimodal message with document content', async () => {
+ const chunks = createTextChunks('The document discusses AI')
+ const adapter = createMockConnectionAdapter({ chunks })
+
+ const client = new ChatClient({ connection: adapter })
+
+ await client.sendMessage({
+ content: [
+ { type: 'text', content: 'Summarize this PDF' },
+ {
+ type: 'document',
+ source: {
+ type: 'data',
+ value: 'base64PdfData',
+ mimeType: 'application/pdf',
+ },
+ },
+ ],
+ })
+
+ const messages = client.getMessages()
+ expect(messages[0]?.parts[1]).toEqual({
+ type: 'document',
+ source: {
+ type: 'data',
+ value: 'base64PdfData',
+ mimeType: 'application/pdf',
+ },
+ })
+ })
+
+ it('should use custom message id when provided', async () => {
+ const chunks = createTextChunks('Response')
+ const adapter = createMockConnectionAdapter({ chunks })
+
+ const client = new ChatClient({ connection: adapter })
+
+ await client.sendMessage({
+ content: 'Hello',
+ id: 'custom-message-id-123',
+ })
+
+ const messages = client.getMessages()
+ expect(messages[0]?.id).toBe('custom-message-id-123')
+ })
+
+ it('should generate message id when not provided', async () => {
+ const chunks = createTextChunks('Response')
+ const adapter = createMockConnectionAdapter({ chunks })
+
+ const client = new ChatClient({ connection: adapter })
+
+ await client.sendMessage({
+ content: 'Hello',
+ })
+
+ const messages = client.getMessages()
+ expect(messages[0]?.id).toMatch(/^msg-/)
+ })
+
+ it('should allow empty content array', async () => {
+ const chunks = createTextChunks('Response')
+ const adapter = createMockConnectionAdapter({ chunks })
+
+ const client = new ChatClient({ connection: adapter })
+
+ await client.sendMessage({
+ content: [],
+ })
+
+ const messages = client.getMessages()
+ expect(messages.length).toBeGreaterThan(0)
+ expect(messages[0]?.parts).toEqual([])
+ })
+
+ it('should send string content as simple text message', async () => {
+ const chunks = createTextChunks('Response')
+ const adapter = createMockConnectionAdapter({ chunks })
+
+ const client = new ChatClient({ connection: adapter })
+
+ await client.sendMessage({
+ content: 'Hello world',
+ })
+
+ const messages = client.getMessages()
+ expect(messages[0]?.parts).toEqual([
+ { type: 'text', content: 'Hello world' },
+ ])
+ })
+
+ it('should merge per-message body with base body', async () => {
+ const chunks = createTextChunks('Response')
+ let capturedData: Record<string, unknown> | undefined
+ const adapter = createMockConnectionAdapter({
+ chunks,
+ onConnect: (_messages, data) => {
+ capturedData = data
+ },
+ })
+
+ const client = new ChatClient({
+ connection: adapter,
+ body: { model: 'gpt-4', temperature: 0.7 },
+ })
+
+ await client.sendMessage('Hello', {
+ model: 'gpt-4-turbo',
+ maxTokens: 100,
+ })
+
+ // Per-message body should override base body
+ expect(capturedData?.model).toBe('gpt-4-turbo')
+ expect(capturedData?.temperature).toBe(0.7) // From base body
+ expect(capturedData?.maxTokens).toBe(100) // From per-message body
+ })
+
+ it('should include conversationId in merged body', async () => {
+ const chunks = createTextChunks('Response')
+ let capturedData: Record<string, unknown> | undefined
+ const adapter = createMockConnectionAdapter({
+ chunks,
+ onConnect: (_messages, data) => {
+ capturedData = data
+ },
+ })
+
+ const client = new ChatClient({
+ connection: adapter,
+ id: 'my-conversation',
+ })
+
+ await client.sendMessage('Hello')
+
+ expect(capturedData?.conversationId).toBe('my-conversation')
+ })
+
+ it('should clear per-message body after request', async () => {
+ const chunks = createTextChunks('Response')
+ let capturedData: Record<string, unknown> | undefined
+ const adapter = createMockConnectionAdapter({
+ chunks,
+ onConnect: (_messages, data) => {
+ capturedData = data
+ },
+ })
+
+ const client = new ChatClient({
+ connection: adapter,
+ body: { model: 'gpt-4' },
+ })
+
+ // First message with per-message body
+ await client.sendMessage('First', { temperature: 0.9 })
+ expect(capturedData?.temperature).toBe(0.9)
+
+ // Second message without per-message body should not have temperature
+ await client.sendMessage('Second')
+ expect(capturedData?.temperature).toBeUndefined()
+ expect(capturedData?.model).toBe('gpt-4')
+ })
+
+ it('should emit events with multimodal content', async () => {
+ const chunks = createTextChunks('Response')
+ const adapter = createMockConnectionAdapter({ chunks })
+
+ const { aiEventClient } = await import('@tanstack/ai/event-client')
+ const emitSpy = vi.spyOn(aiEventClient, 'emit')
+ emitSpy.mockClear() // Clear any previous calls
+
+ const client = new ChatClient({ connection: adapter })
+
+ await client.sendMessage({
+ content: [
+ { type: 'text', content: 'What is this?' },
+ {
+ type: 'image',
+ source: { type: 'url', value: 'https://example.com/img.jpg' },
+ },
+ ],
+ })
+
+ // Find message created events for user role
+ const userMessageCreatedCalls = emitSpy.mock.calls.filter(
+ ([eventName, data]) =>
+ eventName === 'text:message:created' &&
+ (data as any)?.role === 'user',
+ )
+
+ // Should have at least one user message created event
+ expect(userMessageCreatedCalls.length).toBeGreaterThan(0)
+
+ // The event should include the text content extracted from multimodal content
+ const userMessageEvent = userMessageCreatedCalls[0]
+ expect((userMessageEvent?.[1] as any)?.content).toBe('What is this?')
+ })
+ })
})
diff --git a/packages/typescript/ai-devtools/src/store/ai-context.tsx b/packages/typescript/ai-devtools/src/store/ai-context.tsx
index bc43d7f32..4b852da15 100644
--- a/packages/typescript/ai-devtools/src/store/ai-context.tsx
+++ b/packages/typescript/ai-devtools/src/store/ai-context.tsx
@@ -1,10 +1,19 @@
import { batch, createContext, onCleanup, onMount, useContext } from 'solid-js'
import { createStore, produce } from 'solid-js/store'
import { aiEventClient } from '@tanstack/ai/event-client'
+import type { ContentPartSource } from '@tanstack/ai'
import type { ParentComponent } from 'solid-js'
interface MessagePart {
- type: 'text' | 'tool-call' | 'tool-result' | 'thinking'
+ type:
+ | 'text'
+ | 'tool-call'
+ | 'tool-result'
+ | 'thinking'
+ | 'image'
+ | 'audio'
+ | 'video'
+ | 'document'
content?: string
toolCallId?: string
toolName?: string
@@ -12,6 +21,9 @@ interface MessagePart {
state?: string
output?: unknown
error?: string
+ // Multimodal content fields
+ source?: ContentPartSource
+ metadata?: unknown
}
export interface ToolCall {
@@ -685,37 +697,58 @@ export const AIProvider: ParentComponent = (props) => {
(message) => message.id === messageId,
)
- const parts = e.payload.parts?.map((part) => {
- if (part.type === 'text') {
- return { type: 'text', content: part.content } satisfies MessagePart
- }
- if (part.type === 'tool-call') {
- return {
- type: 'tool-call',
- toolCallId: part.id,
- toolName: part.name,
- arguments: part.arguments,
- state: part.state,
- output: part.output,
- content: part.approval
- ? JSON.stringify(part.approval)
- : undefined,
- } satisfies MessagePart
- }
- if (part.type === 'tool-result') {
- return {
- type: 'tool-result',
- toolCallId: part.toolCallId,
- content: part.content,
- state: part.state,
- error: part.error,
- } satisfies MessagePart
- }
- return {
- type: 'thinking',
- content: part.content,
- } satisfies MessagePart
- })
+ const parts = e.payload.parts
+ ?.map((part): MessagePart | null => {
+ if (part.type === 'text') {
+ return { type: 'text', content: part.content }
+ }
+ if (part.type === 'tool-call') {
+ return {
+ type: 'tool-call',
+ toolCallId: part.id,
+ toolName: part.name,
+ arguments: part.arguments,
+ state: part.state,
+ output: part.output,
+ content: part.approval
+ ? JSON.stringify(part.approval)
+ : undefined,
+ }
+ }
+ if (part.type === 'tool-result') {
+ return {
+ type: 'tool-result',
+ toolCallId: part.toolCallId,
+ content: part.content,
+ state: part.state,
+ error: part.error,
+ }
+ }
+ if (part.type === 'thinking') {
+ return {
+ type: 'thinking',
+ content: part.content,
+ }
+ }
+ // Handle multimodal parts (image, audio, video, document)
+ // These have a source property instead of content
+ if (
+ part.type === 'image' ||
+ part.type === 'audio' ||
+ part.type === 'video' ||
+ // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
+ part.type === 'document'
+ ) {
+ return {
+ type: part.type,
+ source: part.source,
+ metadata: part.metadata,
+ }
+ }
+ // Fallback for any unknown part types - skip them
+ return null
+ })
+ .filter((part): part is MessagePart => part !== null)
const toolCalls = e.payload.toolCalls?.map((toolCall) => ({
id: toolCall.id,
diff --git a/packages/typescript/ai-gemini/src/adapters/text.ts b/packages/typescript/ai-gemini/src/adapters/text.ts
index 6210dadc4..031298c18 100644
--- a/packages/typescript/ai-gemini/src/adapters/text.ts
+++ b/packages/typescript/ai-gemini/src/adapters/text.ts
@@ -30,13 +30,7 @@ import type {
TextOptions,
} from '@tanstack/ai'
import type { ExternalTextProviderOptions } from '../text/text-provider-options'
-import type {
- GeminiAudioMetadata,
- GeminiDocumentMetadata,
- GeminiImageMetadata,
- GeminiMessageMetadataByModality,
- GeminiVideoMetadata,
-} from '../message-types'
+import type { GeminiMessageMetadataByModality } from '../message-types'
import type { GeminiClientConfig } from '../utils'
/**
@@ -483,20 +477,6 @@ export class GeminiTextAdapter<
}
private convertContentPartToGemini(part: ContentPart): Part {
- const getDefaultFileType = (
- part: 'image' | 'audio' | 'video' | 'document',
- ) => {
- switch (part) {
- case 'image':
- return 'image/jpeg'
- case 'audio':
- return 'audio/mp3'
- case 'video':
- return 'video/mp4'
- case 'document':
- return 'application/pdf'
- }
- }
switch (part.type) {
case 'text':
return { text: part.content }
@@ -504,24 +484,26 @@ export class GeminiTextAdapter<
case 'audio':
case 'video':
case 'document': {
- const metadata = part.metadata as
- | GeminiDocumentMetadata
- | GeminiImageMetadata
- | GeminiVideoMetadata
- | GeminiAudioMetadata
- | undefined
if (part.source.type === 'data') {
return {
inlineData: {
data: part.source.value,
- mimeType: metadata?.mimeType ?? getDefaultFileType(part.type),
+ mimeType: part.source.mimeType,
},
}
} else {
+ // For URL sources, use provided mimeType or fall back to reasonable defaults
+ const defaultMimeType = {
+ image: 'image/jpeg',
+ audio: 'audio/mp3',
+ video: 'video/mp4',
+ document: 'application/pdf',
+ }[part.type]
+
return {
fileData: {
fileUri: part.source.value,
- mimeType: metadata?.mimeType ?? getDefaultFileType(part.type),
+ mimeType: part.source.mimeType ?? defaultMimeType,
},
}
}
diff --git a/packages/typescript/ai-gemini/src/index.ts b/packages/typescript/ai-gemini/src/index.ts
index c60ce0756..ffea4c31c 100644
--- a/packages/typescript/ai-gemini/src/index.ts
+++ b/packages/typescript/ai-gemini/src/index.ts
@@ -79,9 +79,5 @@ export type {
GeminiAudioMetadata,
GeminiVideoMetadata,
GeminiDocumentMetadata,
- GeminiImageMimeType,
- GeminiAudioMimeType,
- GeminiVideoMimeType,
- GeminiDocumentMimeType,
GeminiMessageMetadataByModality,
} from './message-types'
diff --git a/packages/typescript/ai-grok/src/adapters/text.ts b/packages/typescript/ai-grok/src/adapters/text.ts
index f8703f7fc..c0204ab53 100644
--- a/packages/typescript/ai-grok/src/adapters/text.ts
+++ b/packages/typescript/ai-grok/src/adapters/text.ts
@@ -502,10 +502,16 @@ export class GrokTextAdapter<
parts.push({ type: 'text', text: part.content })
} else if (part.type === 'image') {
const imageMetadata = part.metadata as GrokImageMetadata | undefined
+ // For base64 data, construct a data URI using the mimeType from source
+ const imageValue = part.source.value
+ const imageUrl =
+ part.source.type === 'data' && !imageValue.startsWith('data:')
+ ? `data:${part.source.mimeType};base64,${imageValue}`
+ : imageValue
parts.push({
type: 'image_url',
image_url: {
- url: part.source.value,
+ url: imageUrl,
detail: imageMetadata?.detail || 'auto',
},
})
diff --git a/packages/typescript/ai-openai/src/adapters/text.ts b/packages/typescript/ai-openai/src/adapters/text.ts
index b367afcc0..1747ce4ec 100644
--- a/packages/typescript/ai-openai/src/adapters/text.ts
+++ b/packages/typescript/ai-openai/src/adapters/text.ts
@@ -813,10 +813,14 @@ export class OpenAITextAdapter<
detail: imageMetadata?.detail || 'auto',
}
}
- // For base64 data, construct a data URI
+ // For base64 data, construct a data URI using the mimeType from source
+ const imageValue = part.source.value
+ const imageUrl = imageValue.startsWith('data:')
+ ? imageValue
+ : `data:${part.source.mimeType};base64,${imageValue}`
return {
type: 'input_image',
- image_url: part.source.value,
+ image_url: imageUrl,
detail: imageMetadata?.detail || 'auto',
}
}
diff --git a/packages/typescript/ai-openrouter/src/adapters/text.ts b/packages/typescript/ai-openrouter/src/adapters/text.ts
index d3a7e6a53..387110b8b 100644
--- a/packages/typescript/ai-openrouter/src/adapters/text.ts
+++ b/packages/typescript/ai-openrouter/src/adapters/text.ts
@@ -590,10 +590,16 @@ export class OpenRouterTextAdapter<
break
case 'image': {
const meta = part.metadata as OpenRouterImageMetadata | undefined
+ // For base64 data, construct a data URI using the mimeType from source
+ const imageValue = part.source.value
+ const imageUrl =
+ part.source.type === 'data' && !imageValue.startsWith('data:')
+ ? `data:${part.source.mimeType};base64,${imageValue}`
+ : imageValue
parts.push({
type: 'image_url',
imageUrl: {
- url: part.source.value,
+ url: imageUrl,
detail: meta?.detail || 'auto',
},
})
diff --git a/packages/typescript/ai-preact/src/types.ts b/packages/typescript/ai-preact/src/types.ts
index 7679e08af..3b1a4a041 100644
--- a/packages/typescript/ai-preact/src/types.ts
+++ b/packages/typescript/ai-preact/src/types.ts
@@ -3,11 +3,12 @@ import type {
ChatClientOptions,
ChatClientState,
ChatRequestBody,
+ MultimodalContent,
UIMessage,
} from '@tanstack/ai-client'
// Re-export types from ai-client
-export type { ChatRequestBody, UIMessage }
+export type { ChatRequestBody, MultimodalContent, UIMessage }
/**
* Options for the useChat hook.
@@ -40,9 +41,10 @@ export interface UseChatReturn<
messages: Array<UIMessage<TTools>>
/**
- * Send a message and get a response
+ * Send a message and get a response.
+ * Can be a simple string or multimodal content with images, audio, etc.
*/
- sendMessage: (content: string) => Promise<void>
+ sendMessage: (content: string | MultimodalContent) => Promise<void>
/**
* Append a message to the conversation
diff --git a/packages/typescript/ai-preact/src/use-chat.ts b/packages/typescript/ai-preact/src/use-chat.ts
index 336eb7cb8..cfa9340f4 100644
--- a/packages/typescript/ai-preact/src/use-chat.ts
+++ b/packages/typescript/ai-preact/src/use-chat.ts
@@ -10,7 +10,12 @@ import {
import type { ChatClientState } from '@tanstack/ai-client'
import type { AnyClientTool, ModelMessage } from '@tanstack/ai'
-import type { UIMessage, UseChatOptions, UseChatReturn } from './types'
+import type {
+ MultimodalContent,
+ UIMessage,
+ UseChatOptions,
+ UseChatReturn,
+} from './types'
export function useChat<TTools extends Record<string, AnyClientTool> = any>(
options: UseChatOptions<TTools>,
@@ -109,7 +114,7 @@ export function useChat<TTools extends Record<string, AnyClientTool> = any>(
// are captured at client creation time. Changes to these callbacks require
// remounting the component or changing the connection to recreate the client.
const sendMessage = useCallback(
- async (content: string) => {
+ async (content: string | MultimodalContent) => {
await client.sendMessage(content)
},
[client],
diff --git a/packages/typescript/ai-react/src/types.ts b/packages/typescript/ai-react/src/types.ts
index 0bca98831..a960a1b8f 100644
--- a/packages/typescript/ai-react/src/types.ts
+++ b/packages/typescript/ai-react/src/types.ts
@@ -3,11 +3,12 @@ import type {
ChatClientOptions,
ChatClientState,
ChatRequestBody,
+ MultimodalContent,
UIMessage,
} from '@tanstack/ai-client'
// Re-export types from ai-client
-export type { ChatRequestBody, UIMessage }
+export type { ChatRequestBody, MultimodalContent, UIMessage }
/**
* Options for the useChat hook.
@@ -40,9 +41,10 @@ export interface UseChatReturn<
messages: Array<UIMessage<TTools>>
/**
- * Send a message and get a response
+ * Send a message and get a response.
+ * Can be a simple string or multimodal content with images, audio, etc.
*/
- sendMessage: (content: string) => Promise<void>
+ sendMessage: (content: string | MultimodalContent) => Promise<void>
/**
* Append a message to the conversation
diff --git a/packages/typescript/ai-react/src/use-chat.ts b/packages/typescript/ai-react/src/use-chat.ts
index 509fd64f1..2cdc02d11 100644
--- a/packages/typescript/ai-react/src/use-chat.ts
+++ b/packages/typescript/ai-react/src/use-chat.ts
@@ -3,7 +3,12 @@ import { useCallback, useEffect, useId, useMemo, useRef, useState } from 'react'
import type { AnyClientTool, ModelMessage } from '@tanstack/ai'
import type { ChatClientState } from '@tanstack/ai-client'
-import type { UIMessage, UseChatOptions, UseChatReturn } from './types'
+import type {
+ MultimodalContent,
+ UIMessage,
+ UseChatOptions,
+ UseChatReturn,
+} from './types'
export function useChat<TTools extends Record<string, AnyClientTool> = any>(
options: UseChatOptions<TTools>,
@@ -109,7 +114,7 @@ export function useChat<TTools extends Record<string, AnyClientTool> = any>(
// remounting the component or changing the connection to recreate the client.
const sendMessage = useCallback(
- async (content: string) => {
+ async (content: string | MultimodalContent) => {
await client.sendMessage(content)
},
[client],
diff --git a/packages/typescript/ai-react/tests/use-chat.test.ts b/packages/typescript/ai-react/tests/use-chat.test.ts
index 1eba78a7d..282c2ae5d 100644
--- a/packages/typescript/ai-react/tests/use-chat.test.ts
+++ b/packages/typescript/ai-react/tests/use-chat.test.ts
@@ -1242,4 +1242,326 @@ describe('useChat', () => {
})
})
})
+
+ describe('multimodal sendMessage', () => {
+ it('should send a multimodal message with image URL', async () => {
+ const chunks = createTextChunks('I see a cat in the image')
+ const adapter = createMockConnectionAdapter({ chunks })
+ const { result } = renderUseChat({ connection: adapter })
+
+ await result.current.sendMessage({
+ content: [
+ { type: 'text', content: 'What is in this image?' },
+ {
+ type: 'image',
+ source: { type: 'url', value: 'https://example.com/cat.jpg' },
+ },
+ ],
+ })
+
+ await waitFor(() => {
+ expect(result.current.messages.length).toBeGreaterThan(0)
+ })
+
+ const userMessage = result.current.messages.find((m) => m.role === 'user')
+ expect(userMessage).toBeDefined()
+ expect(userMessage?.parts.length).toBe(2)
+ expect(userMessage?.parts[0]).toEqual({
+ type: 'text',
+ content: 'What is in this image?',
+ })
+ expect(userMessage?.parts[1]).toEqual({
+ type: 'image',
+ source: { type: 'url', value: 'https://example.com/cat.jpg' },
+ })
+ })
+
+ it('should send a multimodal message with image data and required mimeType', async () => {
+ const chunks = createTextChunks('I see a cat in the image')
+ const adapter = createMockConnectionAdapter({ chunks })
+ const { result } = renderUseChat({ connection: adapter })
+
+ await result.current.sendMessage({
+ content: [
+ { type: 'text', content: 'What is in this image?' },
+ {
+ type: 'image',
+ source: {
+ type: 'data',
+ value: 'base64ImageData',
+ mimeType: 'image/png',
+ },
+ },
+ ],
+ })
+
+ await waitFor(() => {
+ expect(result.current.messages.length).toBeGreaterThan(0)
+ })
+
+ const userMessage = result.current.messages.find((m) => m.role === 'user')
+ expect(userMessage?.parts[1]).toEqual({
+ type: 'image',
+ source: {
+ type: 'data',
+ value: 'base64ImageData',
+ mimeType: 'image/png',
+ },
+ })
+ })
+
+ it('should send a multimodal message with audio data and required mimeType', async () => {
+ const chunks = createTextChunks('The audio says hello')
+ const adapter = createMockConnectionAdapter({ chunks })
+ const { result } = renderUseChat({ connection: adapter })
+
+ await result.current.sendMessage({
+ content: [
+ { type: 'text', content: 'Transcribe this audio' },
+ {
+ type: 'audio',
+ source: {
+ type: 'data',
+ value: 'base64AudioData',
+ mimeType: 'audio/mp3',
+ },
+ },
+ ],
+ })
+
+ await waitFor(() => {
+ expect(result.current.messages.length).toBeGreaterThan(0)
+ })
+
+ const userMessage = result.current.messages.find((m) => m.role === 'user')
+ expect(userMessage?.parts[1]).toEqual({
+ type: 'audio',
+ source: {
+ type: 'data',
+ value: 'base64AudioData',
+ mimeType: 'audio/mp3',
+ },
+ })
+ })
+
+ it('should send a multimodal message with video URL', async () => {
+ const chunks = createTextChunks('The video shows a sunset')
+ const adapter = createMockConnectionAdapter({ chunks })
+ const { result } = renderUseChat({ connection: adapter })
+
+ await result.current.sendMessage({
+ content: [
+ { type: 'text', content: 'Describe this video' },
+ {
+ type: 'video',
+ source: { type: 'url', value: 'https://example.com/video.mp4' },
+ },
+ ],
+ })
+
+ await waitFor(() => {
+ expect(result.current.messages.length).toBeGreaterThan(0)
+ })
+
+ const userMessage = result.current.messages.find((m) => m.role === 'user')
+ expect(userMessage?.parts[1]).toEqual({
+ type: 'video',
+ source: { type: 'url', value: 'https://example.com/video.mp4' },
+ })
+ })
+
+ it('should send a multimodal message with video data and required mimeType', async () => {
+ const chunks = createTextChunks('The video shows a sunset')
+ const adapter = createMockConnectionAdapter({ chunks })
+ const { result } = renderUseChat({ connection: adapter })
+
+ await result.current.sendMessage({
+ content: [
+ { type: 'text', content: 'Describe this video' },
+ {
+ type: 'video',
+ source: {
+ type: 'data',
+ value: 'base64VideoData',
+ mimeType: 'video/mp4',
+ },
+ },
+ ],
+ })
+
+ await waitFor(() => {
+ expect(result.current.messages.length).toBeGreaterThan(0)
+ })
+
+ const userMessage = result.current.messages.find((m) => m.role === 'user')
+ expect(userMessage?.parts[1]).toEqual({
+ type: 'video',
+ source: {
+ type: 'data',
+ value: 'base64VideoData',
+ mimeType: 'video/mp4',
+ },
+ })
+ })
+
+ it('should send a multimodal message with document data and required mimeType', async () => {
+ const chunks = createTextChunks('The document discusses AI')
+ const adapter = createMockConnectionAdapter({ chunks })
+ const { result } = renderUseChat({ connection: adapter })
+
+ await result.current.sendMessage({
+ content: [
+ { type: 'text', content: 'Summarize this PDF' },
+ {
+ type: 'document',
+ source: {
+ type: 'data',
+ value: 'base64PdfData',
+ mimeType: 'application/pdf',
+ },
+ },
+ ],
+ })
+
+ await waitFor(() => {
+ expect(result.current.messages.length).toBeGreaterThan(0)
+ })
+
+ const userMessage = result.current.messages.find((m) => m.role === 'user')
+ expect(userMessage?.parts[1]).toEqual({
+ type: 'document',
+ source: {
+ type: 'data',
+ value: 'base64PdfData',
+ mimeType: 'application/pdf',
+ },
+ })
+ })
+
+ it('should send a multimodal message with document URL', async () => {
+ const chunks = createTextChunks('The document discusses AI')
+ const adapter = createMockConnectionAdapter({ chunks })
+ const { result } = renderUseChat({ connection: adapter })
+
+ await result.current.sendMessage({
+ content: [
+ { type: 'text', content: 'Summarize this document' },
+ {
+ type: 'document',
+ source: {
+ type: 'url',
+ value: 'https://example.com/doc.pdf',
+ },
+ },
+ ],
+ })
+
+ await waitFor(() => {
+ expect(result.current.messages.length).toBeGreaterThan(0)
+ })
+
+ const userMessage = result.current.messages.find((m) => m.role === 'user')
+ expect(userMessage?.parts[1]).toEqual({
+ type: 'document',
+ source: {
+ type: 'url',
+ value: 'https://example.com/doc.pdf',
+ },
+ })
+ })
+
+ it('should send complex multimodal message with multiple content parts', async () => {
+ const chunks = createTextChunks('I see multiple items')
+ const adapter = createMockConnectionAdapter({ chunks })
+ const { result } = renderUseChat({ connection: adapter })
+
+ await result.current.sendMessage({
+ content: [
+ { type: 'text', content: 'Compare these items' },
+ {
+ type: 'image',
+ source: {
+ type: 'data',
+ value: 'base64Image1',
+ mimeType: 'image/jpeg',
+ },
+ },
+ {
+ type: 'image',
+ source: {
+ type: 'url',
+ value: 'https://example.com/image2.png',
+ },
+ },
+ { type: 'text', content: 'Which one is better?' },
+ ],
+ })
+
+ await waitFor(() => {
+ expect(result.current.messages.length).toBeGreaterThan(0)
+ })
+
+ const userMessage = result.current.messages.find((m) => m.role === 'user')
+ expect(userMessage?.parts.length).toBe(4)
+ expect(userMessage?.parts[0]).toEqual({
+ type: 'text',
+ content: 'Compare these items',
+ })
+ expect(userMessage?.parts[1]).toEqual({
+ type: 'image',
+ source: {
+ type: 'data',
+ value: 'base64Image1',
+ mimeType: 'image/jpeg',
+ },
+ })
+ expect(userMessage?.parts[2]).toEqual({
+ type: 'image',
+ source: {
+ type: 'url',
+ value: 'https://example.com/image2.png',
+ },
+ })
+ expect(userMessage?.parts[3]).toEqual({
+ type: 'text',
+ content: 'Which one is better?',
+ })
+ })
+
+ it('should use custom message id when provided in multimodal message', async () => {
+ const chunks = createTextChunks('Response')
+ const adapter = createMockConnectionAdapter({ chunks })
+ const { result } = renderUseChat({ connection: adapter })
+
+ await result.current.sendMessage({
+ content: [{ type: 'text', content: 'Hello' }],
+ id: 'custom-multimodal-id-123',
+ })
+
+ await waitFor(() => {
+ expect(result.current.messages.length).toBeGreaterThan(0)
+ })
+
+ expect(result.current.messages[0]?.id).toBe('custom-multimodal-id-123')
+ })
+
+ it('should send string content as simple text message via multimodal interface', async () => {
+ const chunks = createTextChunks('Response')
+ const adapter = createMockConnectionAdapter({ chunks })
+ const { result } = renderUseChat({ connection: adapter })
+
+ await result.current.sendMessage({
+ content: 'Hello world',
+ })
+
+ await waitFor(() => {
+ expect(result.current.messages.length).toBeGreaterThan(0)
+ })
+
+ const userMessage = result.current.messages.find((m) => m.role === 'user')
+ expect(userMessage?.parts).toEqual([
+ { type: 'text', content: 'Hello world' },
+ ])
+ })
+ })
})
diff --git a/packages/typescript/ai-solid/src/types.ts b/packages/typescript/ai-solid/src/types.ts
index 050fb8010..4d75ae531 100644
--- a/packages/typescript/ai-solid/src/types.ts
+++ b/packages/typescript/ai-solid/src/types.ts
@@ -3,12 +3,13 @@ import type {
ChatClientOptions,
ChatClientState,
ChatRequestBody,
+ MultimodalContent,
UIMessage,
} from '@tanstack/ai-client'
import type { Accessor } from 'solid-js'
// Re-export types from ai-client
-export type { ChatRequestBody, UIMessage }
+export type { ChatRequestBody, MultimodalContent, UIMessage }
/**
* Options for the useChat hook.
@@ -41,9 +42,10 @@ export interface UseChatReturn<
messages: Accessor<Array<UIMessage<TTools>>>
/**
- * Send a message and get a response
+ * Send a message and get a response.
+ * Can be a simple string or multimodal content with images, audio, etc.
*/
- sendMessage: (content: string) => Promise<void>
+ sendMessage: (content: string | MultimodalContent) => Promise<void>
/**
* Append a message to the conversation
diff --git a/packages/typescript/ai-solid/src/use-chat.ts b/packages/typescript/ai-solid/src/use-chat.ts
index e178a2c5e..77d0edf96 100644
--- a/packages/typescript/ai-solid/src/use-chat.ts
+++ b/packages/typescript/ai-solid/src/use-chat.ts
@@ -8,7 +8,12 @@ import {
import { ChatClient } from '@tanstack/ai-client'
import type { ChatClientState } from '@tanstack/ai-client'
import type { AnyClientTool, ModelMessage } from '@tanstack/ai'
-import type { UIMessage, UseChatOptions, UseChatReturn } from './types'
+import type {
+ MultimodalContent,
+ UIMessage,
+ UseChatOptions,
+ UseChatReturn,
+} from './types'
export function useChat<TTools extends Record<string, AnyClientTool> = any>(
options: UseChatOptions<TTools> = {} as UseChatOptions<TTools>,
@@ -93,7 +98,7 @@ export function useChat<TTools extends Record<string, AnyClientTool> = any>(
// are captured at client creation time. Changes to these callbacks require
// remounting the component or changing the connection to recreate the client.
- const sendMessage = async (content: string) => {
+ const sendMessage = async (content: string | MultimodalContent) => {
await client().sendMessage(content)
}
diff --git a/packages/typescript/ai-svelte/src/create-chat.svelte.ts b/packages/typescript/ai-svelte/src/create-chat.svelte.ts
index 07242d018..5354ae113 100644
--- a/packages/typescript/ai-svelte/src/create-chat.svelte.ts
+++ b/packages/typescript/ai-svelte/src/create-chat.svelte.ts
@@ -1,7 +1,12 @@
import { ChatClient } from '@tanstack/ai-client'
import type { ChatClientState } from '@tanstack/ai-client'
import type { AnyClientTool, ModelMessage } from '@tanstack/ai'
-import type { CreateChatOptions, CreateChatReturn, UIMessage } from './types'
+import type {
+ CreateChatOptions,
+ CreateChatReturn,
+ MultimodalContent,
+ UIMessage,
+} from './types'
/**
* Creates a reactive chat instance for Svelte 5.
@@ -83,7 +88,7 @@ export function createChat<TTools extends Record<string, AnyClientTool> = any>(
// Users should call chat.stop() in their component's cleanup if needed.
// Define methods
- const sendMessage = async (content: string) => {
+ const sendMessage = async (content: string | MultimodalContent) => {
await client.sendMessage(content)
}
diff --git a/packages/typescript/ai-svelte/src/types.ts b/packages/typescript/ai-svelte/src/types.ts
index e18e87721..df13ed0a0 100644
--- a/packages/typescript/ai-svelte/src/types.ts
+++ b/packages/typescript/ai-svelte/src/types.ts
@@ -3,11 +3,12 @@ import type {
ChatClientOptions,
ChatClientState,
ChatRequestBody,
+ MultimodalContent,
UIMessage,
} from '@tanstack/ai-client'
// Re-export types from ai-client
-export type { ChatRequestBody, UIMessage }
+export type { ChatRequestBody, MultimodalContent, UIMessage }
/**
* Options for the createChat function.
@@ -41,9 +42,10 @@ export interface CreateChatReturn<
readonly messages: Array<UIMessage<TTools>>
/**
- * Send a message and get a response
+ * Send a message and get a response.
+ * Can be a simple string or multimodal content with images, audio, etc.
*/
- sendMessage: (content: string) => Promise<void>
+ sendMessage: (content: string | MultimodalContent) => Promise<void>
/**
* Append a message to the conversation
diff --git a/packages/typescript/ai-vue/src/types.ts b/packages/typescript/ai-vue/src/types.ts
index 7c09f103a..31c948503 100644
--- a/packages/typescript/ai-vue/src/types.ts
+++ b/packages/typescript/ai-vue/src/types.ts
@@ -3,12 +3,13 @@ import type {
ChatClientOptions,
ChatClientState,
ChatRequestBody,
+ MultimodalContent,
UIMessage,
} from '@tanstack/ai-client'
import type { DeepReadonly, ShallowRef } from 'vue'
// Re-export types from ai-client
-export type { ChatRequestBody, UIMessage }
+export type { ChatRequestBody, MultimodalContent, UIMessage }
/**
* Options for the useChat composable.
@@ -41,9 +42,10 @@ export interface UseChatReturn<
messages: DeepReadonly<ShallowRef<Array<UIMessage<TTools>>>>
/**
- * Send a message and get a response
+ * Send a message and get a response.
+ * Can be a simple string or multimodal content with images, audio, etc.
*/
- sendMessage: (content: string) => Promise<void>
+ sendMessage: (content: string | MultimodalContent) => Promise<void>
/**
* Append a message to the conversation
diff --git a/packages/typescript/ai-vue/src/use-chat.ts b/packages/typescript/ai-vue/src/use-chat.ts
index 684d3f800..6042fc535 100644
--- a/packages/typescript/ai-vue/src/use-chat.ts
+++ b/packages/typescript/ai-vue/src/use-chat.ts
@@ -2,7 +2,12 @@ import { ChatClient } from '@tanstack/ai-client'
import { onScopeDispose, readonly, shallowRef, useId, watch } from 'vue'
import type { AnyClientTool, ModelMessage } from '@tanstack/ai'
import type { ChatClientState } from '@tanstack/ai-client'
-import type { UIMessage, UseChatOptions, UseChatReturn } from './types'
+import type {
+ MultimodalContent,
+ UIMessage,
+ UseChatOptions,
+ UseChatReturn,
+} from './types'
export function useChat<TTools extends Record<string, AnyClientTool> = any>(
options: UseChatOptions<TTools> = {} as UseChatOptions<TTools>,
@@ -66,7 +71,7 @@ export function useChat<TTools extends Record<string, AnyClientTool> = any>(
// are captured at client creation time. Changes to these callbacks require
// remounting the component or changing the connection to recreate the client.
- const sendMessage = async (content: string) => {
+ const sendMessage = async (content: string | MultimodalContent) => {
await client.sendMessage(content)
}
diff --git a/packages/typescript/ai/src/activities/chat/messages.ts b/packages/typescript/ai/src/activities/chat/messages.ts
index 14c8dc621..8c9511ec7 100644
--- a/packages/typescript/ai/src/activities/chat/messages.ts
+++ b/packages/typescript/ai/src/activities/chat/messages.ts
@@ -1,16 +1,34 @@
import type {
+ AudioPart,
ContentPart,
+ DocumentPart,
+ ImagePart,
MessagePart,
ModelMessage,
TextPart,
ToolCallPart,
ToolResultPart,
UIMessage,
+ VideoPart,
} from '../../types'
// ===========================
// Message Converters
// ===========================
+/**
+ * Helper to check if a part is a multimodal content part (image, audio, video, document)
+ */
+function isMultimodalPart(
+ part: MessagePart,
+): part is ImagePart | AudioPart | VideoPart | DocumentPart {
+ return (
+ part.type === 'image' ||
+ part.type === 'audio' ||
+ part.type === 'video' ||
+ part.type === 'document'
+ )
+}
+
/**
* Helper to extract text content from string or ContentPart array
* For multimodal content, this extracts only the text parts
@@ -52,7 +70,8 @@ export function convertMessagesToModelMessages(
* Convert a UIMessage to ModelMessage(s)
*
* This conversion handles the parts-based structure:
- * - Text parts → content field
+ * - Text parts → content field (string or as part of ContentPart array)
+ * - Multimodal parts (image, audio, video, document) → ContentPart array
* - ToolCall parts → toolCalls array
* - ToolResult parts → separate role="tool" messages
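+ *
+ * A sketch of the multimodal path (mirrors the message-converter tests added in this change):
+ *
+ * ```ts
+ * const [modelMsg] = uiMessageToModelMessages({
+ * id: 'msg-1',
+ * role: 'user',
+ * parts: [
+ * { type: 'text', content: 'Describe this' },
+ * { type: 'image', source: { type: 'url', value: 'https://example.com/a.jpg' } },
+ * ],
+ * })
+ * // modelMsg.content is a ContentPart array that preserves part order
+ * ```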
*
@@ -72,12 +91,17 @@ export function uiMessageToModelMessages(
// Separate parts by type
// Note: thinking parts are UI-only and not included in ModelMessages
const textParts: Array<TextPart> = []
+ const multimodalParts: Array<
+ ImagePart | AudioPart | VideoPart | DocumentPart
+ > = []
const toolCallParts: Array<ToolCallPart> = []
const toolResultParts: Array<ToolResultPart> = []
for (const part of uiMessage.parts) {
if (part.type === 'text') {
textParts.push(part)
+ } else if (isMultimodalPart(part)) {
+ multimodalParts.push(part)
} else if (part.type === 'tool-call') {
toolCallParts.push(part)
} else if (part.type === 'tool-result') {
@@ -86,8 +110,26 @@ export function uiMessageToModelMessages(
// thinking parts are skipped - they're UI-only
}
- // Build the main message (user or assistant)
- const content = textParts.map((p) => p.content).join('') || null
+ // Build the content field
+ // If we have multimodal parts, use ContentPart array format
+ // Otherwise, use simple string format for backward compatibility
+ let content: string | null | Array<ContentPart>
+ if (multimodalParts.length > 0) {
+ // Build ContentPart array preserving the order of text and multimodal parts
+ const contentParts: Array<ContentPart> = []
+ for (const part of uiMessage.parts) {
+ if (part.type === 'text') {
+ contentParts.push(part)
+ } else if (isMultimodalPart(part)) {
+ contentParts.push(part)
+ }
+ }
+ content = contentParts
+ } else {
+ // Simple string content for text-only messages
+ content = textParts.map((p) => p.content).join('') || null
+ }
+
const toolCalls =
toolCallParts.length > 0
? toolCallParts
@@ -108,7 +150,9 @@ export function uiMessageToModelMessages(
: undefined
// Create the main message
- if (uiMessage.role !== 'assistant' || content || !toolCalls) {
+ // For multimodal content, we always create a message even if content is an empty array
+ const hasContent = Array.isArray(content) ? true : content !== null
+ if (uiMessage.role !== 'assistant' || hasContent || !toolCalls) {
messageList.push({
role: uiMessage.role,
content,
diff --git a/packages/typescript/ai/src/activities/chat/stream/processor.ts b/packages/typescript/ai/src/activities/chat/stream/processor.ts
index 0f480f9d3..0ce10309f 100644
--- a/packages/typescript/ai/src/activities/chat/stream/processor.ts
+++ b/packages/typescript/ai/src/activities/chat/stream/processor.ts
@@ -35,6 +35,8 @@ import type {
ToolResultState,
} from './types'
import type {
+ ContentPart,
+ MessagePart,
ModelMessage,
StreamChunk,
ToolCall,
@@ -164,13 +166,42 @@ export class StreamProcessor {
}
/**
- * Add a user message to the conversation
- */
- addUserMessage(content: string): UIMessage {
+ * Add a user message to the conversation.
+ * Supports both simple string content and multimodal content arrays.
+ *
+ * @param content - The message content (string or array of content parts)
+ * @param id - Optional custom message ID (generated if not provided)
+ * @returns The created UIMessage
+ *
+ * @example
+ * ```ts
+ * // Simple text message
+ * processor.addUserMessage('Hello!')
+ *
+ * // Multimodal message with image
+ * processor.addUserMessage([
+ * { type: 'text', content: 'What is in this image?' },
+ * { type: 'image', source: { type: 'url', value: 'https://example.com/photo.jpg' } }
+ * ])
+ *
+ * // With custom ID
+ * processor.addUserMessage('Hello!', 'custom-id-123')
+ * ```
+ */
+ addUserMessage(content: string | Array<ContentPart>, id?: string): UIMessage {
+ // Convert content to message parts
+ const parts: Array<MessagePart> =
+ typeof content === 'string'
+ ? [{ type: 'text', content }]
+ : content.map((part) => {
+ // ContentPart types (text, image, audio, video, document) are compatible with MessagePart
+ return part as MessagePart
+ })
+
const userMessage: UIMessage = {
- id: generateMessageId(),
+ id: id ?? generateMessageId(),
role: 'user',
- parts: [{ type: 'text', content }],
+ parts,
createdAt: new Date(),
}
diff --git a/packages/typescript/ai/src/index.ts b/packages/typescript/ai/src/index.ts
index 48ae1164e..92263e37e 100644
--- a/packages/typescript/ai/src/index.ts
+++ b/packages/typescript/ai/src/index.ts
@@ -73,6 +73,9 @@ export {
// All types
export * from './types'
+// Utility functions
+export { detectImageMimeType } from './utils'
+
// Event client + event types
export * from './event-client'
diff --git a/packages/typescript/ai/src/types.ts b/packages/typescript/ai/src/types.ts
index 7bd3c52d9..344034f34 100644
--- a/packages/typescript/ai/src/types.ts
+++ b/packages/typescript/ai/src/types.ts
@@ -108,24 +108,52 @@ export interface ToolCall {
export type Modality = 'text' | 'image' | 'audio' | 'video' | 'document'
/**
- * Source specification for multimodal content.
- * Supports both inline data (base64) and URL-based content.
+ * Source specification for inline data content (base64).
+ * Requires a mimeType to ensure providers receive proper content type information.
+ */
+export interface ContentPartDataSource {
+ /**
+ * Indicates this is inline data content.
+ */
+ type: 'data'
+ /**
+ * The base64-encoded content value.
+ */
+ value: string
+ /**
+ * The MIME type of the content (e.g., 'image/png', 'audio/wav').
+ * Required for data sources to ensure proper handling by providers.
+ */
+ mimeType: string
+}
+
+/**
+ * Source specification for URL-based content.
+ * mimeType is optional as it can often be inferred from the URL or response headers.
*/
-export interface ContentPartSource {
+export interface ContentPartUrlSource {
/**
- * The type of source:
- * - 'data': Inline data (typically base64 encoded)
- * - 'url': URL reference to the content
+ * Indicates this is URL-referenced content.
*/
- type: 'data' | 'url'
+ type: 'url'
/**
- * The actual content value:
- * - For 'data': base64-encoded string
- * - For 'url': HTTP(S) URL or data URI
+ * HTTP(S) URL or data URI pointing to the content.
*/
value: string
+ /**
+ * Optional MIME type hint for cases where providers can't infer it from the URL.
+ */
+ mimeType?: string
}
+/**
+ * Source specification for multimodal content.
+ * Discriminated union supporting both inline data (base64) and URL-based content.
+ * - For 'data' sources: mimeType is required
+ * - For 'url' sources: mimeType is optional
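+ *
+ * A small illustration of both variants (base64Png is an assumed variable
+ * holding a base64 string):
+ *
+ * ```ts
+ * const inline: ContentPartSource = {
+ * type: 'data',
+ * value: base64Png,
+ * mimeType: 'image/png', // required for 'data'
+ * }
+ * const remote: ContentPartSource = {
+ * type: 'url',
+ * value: 'https://example.com/photo.jpg', // mimeType may be omitted
+ * }
+ * ```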
+ */
+export type ContentPartSource = ContentPartDataSource | ContentPartUrlSource
+
/**
* Image content part for multimodal messages.
* @template TMetadata - Provider-specific metadata type (e.g., OpenAI's detail level)
@@ -282,6 +310,10 @@ export interface ThinkingPart {
export type MessagePart =
| TextPart
+ | ImagePart
+ | AudioPart
+ | VideoPart
+ | DocumentPart
| ToolCallPart
| ToolResultPart
| ThinkingPart
diff --git a/packages/typescript/ai/src/utils.ts b/packages/typescript/ai/src/utils.ts
new file mode 100644
index 000000000..e89eb1db1
--- /dev/null
+++ b/packages/typescript/ai/src/utils.ts
@@ -0,0 +1,41 @@
+/**
+ * Detect image mime type from base64 data using magic bytes.
+ * Returns undefined if the format cannot be detected.
+ *
+ * This function analyzes the first few bytes of base64-encoded image data
+ * to determine the image format based on file signature (magic bytes).
+ *
+ * @param base64Data - The base64-encoded image data
+ * @returns The detected mime type, or undefined if unrecognized
+ *
+ * @example
+ * ```ts
+ * const mimeType = detectImageMimeType(imageBase64)
+ * // Returns 'image/jpeg', 'image/png', 'image/gif', 'image/webp', or undefined
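+ *
+ * // A fallback pattern (sketch) for building a data source when detection fails:
+ * // const mimeType = detectImageMimeType(imageBase64) ?? 'image/jpeg'
+ * // const source = { type: 'data', value: imageBase64, mimeType }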
+ * ```
+ */
+export function detectImageMimeType(
+ base64Data: string,
+): 'image/jpeg' | 'image/png' | 'image/gif' | 'image/webp' | undefined {
+ // Get first few bytes (base64 encoded)
+ const prefix = base64Data.substring(0, 20)
+
+ // JPEG: magic bytes FF D8 FF encode to the base64 prefix '/9j/'
+ if (prefix.startsWith('/9j/')) {
+ return 'image/jpeg'
+ }
+ // PNG: the signature 89 50 4E 47 0D 0A 1A 0A encodes to 'iVBORw0KGgo'
+ if (prefix.startsWith('iVBORw0KGgo')) {
+ return 'image/png'
+ }
+ // GIF: the 'GIF87a' and 'GIF89a' headers both encode to the prefix 'R0lGOD'
+ if (prefix.startsWith('R0lGOD')) {
+ return 'image/gif'
+ }
+ // WebP: the RIFF header 52 49 46 46 ('RIFF') encodes to 'UklGR'. This is a
+ // heuristic: the 'WEBP' fourCC at byte offset 8 is not checked, so other
+ // RIFF-based formats would also match.
+ if (prefix.startsWith('UklGR')) {
+ return 'image/webp'
+ }
+
+ return undefined
+}
diff --git a/packages/typescript/ai/tests/message-converters.test.ts b/packages/typescript/ai/tests/message-converters.test.ts
new file mode 100644
index 000000000..60df58ec0
--- /dev/null
+++ b/packages/typescript/ai/tests/message-converters.test.ts
@@ -0,0 +1,397 @@
+import { describe, expect, it } from 'vitest'
+import {
+ modelMessageToUIMessage,
+ uiMessageToModelMessages,
+} from '../src/activities/chat/messages'
+import type { ContentPart, ModelMessage, UIMessage } from '../src/types'
+
+describe('Message Converters', () => {
+ describe('uiMessageToModelMessages', () => {
+ it('should convert simple text message', () => {
+ const uiMessage: UIMessage = {
+ id: 'msg-1',
+ role: 'user',
+ parts: [{ type: 'text', content: 'Hello' }],
+ }
+
+ const result = uiMessageToModelMessages(uiMessage)
+
+ expect(result).toEqual([
+ {
+ role: 'user',
+ content: 'Hello',
+ },
+ ])
+ })
+
+ it('should convert multiple text parts to single string', () => {
+ const uiMessage: UIMessage = {
+ id: 'msg-1',
+ role: 'user',
+ parts: [
+ { type: 'text', content: 'Hello ' },
+ { type: 'text', content: 'world!' },
+ ],
+ }
+
+ const result = uiMessageToModelMessages(uiMessage)
+
+ expect(result).toEqual([
+ {
+ role: 'user',
+ content: 'Hello world!',
+ },
+ ])
+ })
+
+ it('should convert multimodal message with image to ContentPart array', () => {
+ const uiMessage: UIMessage = {
+ id: 'msg-1',
+ role: 'user',
+ parts: [
+ { type: 'text', content: 'What is in this image?' },
+ {
+ type: 'image',
+ source: { type: 'url', value: 'https://example.com/cat.jpg' },
+ },
+ ],
+ }
+
+ const result = uiMessageToModelMessages(uiMessage)
+
+ expect(result.length).toBe(1)
+ expect(result[0]?.role).toBe('user')
+ expect(Array.isArray(result[0]?.content)).toBe(true)
+
+ const contentParts = result[0]?.content as Array<ContentPart>
+ expect(contentParts.length).toBe(2)
+ expect(contentParts[0]).toEqual({
+ type: 'text',
+ content: 'What is in this image?',
+ })
+ expect(contentParts[1]).toEqual({
+ type: 'image',
+ source: { type: 'url', value: 'https://example.com/cat.jpg' },
+ })
+ })
+
+ it('should convert multimodal message with audio', () => {
+ const uiMessage: UIMessage = {
+ id: 'msg-1',
+ role: 'user',
+ parts: [
+ { type: 'text', content: 'Transcribe this' },
+ {
+ type: 'audio',
+ source: {
+ type: 'data',
+ value: 'base64audio',
+ mimeType: 'audio/mp3',
+ },
+ },
+ ],
+ }
+
+ const result = uiMessageToModelMessages(uiMessage)
+
+ const contentParts = result[0]?.content as Array<ContentPart>
+ expect(contentParts[1]).toEqual({
+ type: 'audio',
+ source: { type: 'data', value: 'base64audio', mimeType: 'audio/mp3' },
+ })
+ })
+
+ it('should convert multimodal message with video', () => {
+ const uiMessage: UIMessage = {
+ id: 'msg-1',
+ role: 'user',
+ parts: [
+ { type: 'text', content: 'Describe this video' },
+ {
+ type: 'video',
+ source: { type: 'url', value: 'https://example.com/video.mp4' },
+ },
+ ],
+ }
+
+ const result = uiMessageToModelMessages(uiMessage)
+
+ const contentParts = result[0]?.content as Array<ContentPart>
+ expect(contentParts[1]).toEqual({
+ type: 'video',
+ source: { type: 'url', value: 'https://example.com/video.mp4' },
+ })
+ })
+
+ it('should convert multimodal message with document', () => {
+ const uiMessage: UIMessage = {
+ id: 'msg-1',
+ role: 'user',
+ parts: [
+ { type: 'text', content: 'Summarize this document' },
+ {
+ type: 'document',
+ source: {
+ type: 'data',
+ value: 'base64pdf',
+ mimeType: 'application/pdf',
+ },
+ },
+ ],
+ }
+
+ const result = uiMessageToModelMessages(uiMessage)
+
+ const contentParts = result[0]?.content as Array<ContentPart>
+ expect(contentParts[1]).toEqual({
+ type: 'document',
+ source: {
+ type: 'data',
+ value: 'base64pdf',
+ mimeType: 'application/pdf',
+ },
+ })
+ })
+
+ it('should preserve order of text and multimodal parts', () => {
+ const uiMessage: UIMessage = {
+ id: 'msg-1',
+ role: 'user',
+ parts: [
+ {
+ type: 'image',
+ source: { type: 'url', value: 'https://example.com/img1.jpg' },
+ },
+ { type: 'text', content: 'First image above' },
+ {
+ type: 'image',
+ source: { type: 'url', value: 'https://example.com/img2.jpg' },
+ },
+ { type: 'text', content: 'Second image above' },
+ ],
+ }
+
+ const result = uiMessageToModelMessages(uiMessage)
+
+ const contentParts = result[0]?.content as Array<ContentPart>
+ expect(contentParts.length).toBe(4)
+ expect(contentParts[0]?.type).toBe('image')
+ expect(contentParts[1]?.type).toBe('text')
+ expect(contentParts[2]?.type).toBe('image')
+ expect(contentParts[3]?.type).toBe('text')
+ })
+
+ it('should skip thinking parts in conversion', () => {
+ const uiMessage: UIMessage = {
+ id: 'msg-1',
+ role: 'assistant',
+ parts: [
+ { type: 'thinking', content: 'Let me think...' },
+ { type: 'text', content: 'Here is my answer' },
+ ],
+ }
+
+ const result = uiMessageToModelMessages(uiMessage)
+
+ expect(result.length).toBe(1)
+ expect(result[0]?.content).toBe('Here is my answer')
+ })
+
+ it('should skip system messages', () => {
+ const uiMessage: UIMessage = {
+ id: 'msg-1',
+ role: 'system',
+ parts: [{ type: 'text', content: 'You are a helpful assistant' }],
+ }
+
+ const result = uiMessageToModelMessages(uiMessage)
+
+ expect(result).toEqual([])
+ })
+
+ it('should handle text-only message without multimodal parts as string content', () => {
+ const uiMessage: UIMessage = {
+ id: 'msg-1',
+ role: 'user',
+ parts: [{ type: 'text', content: 'Just text' }],
+ }
+
+ const result = uiMessageToModelMessages(uiMessage)
+
+ // Should be string, not array
+ expect(typeof result[0]?.content).toBe('string')
+ expect(result[0]?.content).toBe('Just text')
+ })
+
+ it('should handle empty parts array', () => {
+ const uiMessage: UIMessage = {
+ id: 'msg-1',
+ role: 'user',
+ parts: [],
+ }
+
+ const result = uiMessageToModelMessages(uiMessage)
+
+ expect(result.length).toBe(1)
+ expect(result[0]?.content).toBe(null)
+ })
+
+ it('should handle multimodal message with only image (no text)', () => {
+ const uiMessage: UIMessage = {
+ id: 'msg-1',
+ role: 'user',
+ parts: [
+ {
+ type: 'image',
+ source: { type: 'url', value: 'https://example.com/cat.jpg' },
+ },
+ ],
+ }
+
+ const result = uiMessageToModelMessages(uiMessage)
+
+ expect(Array.isArray(result[0]?.content)).toBe(true)
+ const contentParts = result[0]?.content as Array<ContentPart>
+ expect(contentParts.length).toBe(1)
+ expect(contentParts[0]?.type).toBe('image')
+ })
+
+ it('should include metadata in multimodal parts', () => {
+ const uiMessage: UIMessage = {
+ id: 'msg-1',
+ role: 'user',
+ parts: [
+ { type: 'text', content: 'Analyze' },
+ {
+ type: 'image',
+ source: { type: 'url', value: 'https://example.com/cat.jpg' },
+ metadata: { detail: 'high' },
+ },
+ ],
+ }
+
+ const result = uiMessageToModelMessages(uiMessage)
+
+ const contentParts = result[0]?.content as Array<ContentPart>
+ expect(contentParts[1]).toEqual({
+ type: 'image',
+ source: { type: 'url', value: 'https://example.com/cat.jpg' },
+ metadata: { detail: 'high' },
+ })
+ })
+
+ it('should handle tool call parts', () => {
+ const uiMessage: UIMessage = {
+ id: 'msg-1',
+ role: 'assistant',
+ parts: [
+ {
+ type: 'tool-call',
+ id: 'tool-1',
+ name: 'getWeather',
+ arguments: '{"city": "NYC"}',
+ state: 'input-complete',
+ },
+ ],
+ }
+
+ const result = uiMessageToModelMessages(uiMessage)
+
+ expect(result[0]?.toolCalls).toBeDefined()
+ expect(result[0]?.toolCalls?.length).toBe(1)
+ expect(result[0]?.toolCalls?.[0]).toEqual({
+ id: 'tool-1',
+ type: 'function',
+ function: {
+ name: 'getWeather',
+ arguments: '{"city": "NYC"}',
+ },
+ })
+ })
+
+ it('should handle tool result parts', () => {
+ const uiMessage: UIMessage = {
+ id: 'msg-1',
+ role: 'assistant',
+ parts: [
+ {
+ type: 'tool-result',
+ toolCallId: 'tool-1',
+ content: '{"temp": 72}',
+ state: 'complete',
+ },
+ ],
+ }
+
+ const result = uiMessageToModelMessages(uiMessage)
+
+ // Should have assistant message + tool message
+ expect(result.length).toBe(2)
+ expect(result[1]?.role).toBe('tool')
+ expect(result[1]?.toolCallId).toBe('tool-1')
+ expect(result[1]?.content).toBe('{"temp": 72}')
+ })
+ })
+
+ describe('modelMessageToUIMessage', () => {
+ it('should convert simple text ModelMessage', () => {
+ const modelMessage: ModelMessage = {
+ role: 'user',
+ content: 'Hello',
+ }
+
+ const result = modelMessageToUIMessage(modelMessage)
+
+ expect(result.role).toBe('user')
+ expect(result.parts).toEqual([{ type: 'text', content: 'Hello' }])
+ expect(result.id).toBeTruthy()
+ })
+
+ it('should use provided id', () => {
+ const modelMessage: ModelMessage = {
+ role: 'user',
+ content: 'Hello',
+ }
+
+ const result = modelMessageToUIMessage(modelMessage, 'custom-id')
+
+ expect(result.id).toBe('custom-id')
+ })
+
+ it('should convert multimodal content to text', () => {
+ const modelMessage: ModelMessage = {
+ role: 'user',
+ content: [
+ { type: 'text', content: 'What is this?' },
+ {
+ type: 'image',
+ source: { type: 'url', value: 'https://example.com/img.jpg' },
+ },
+ ],
+ }
+
+ const result = modelMessageToUIMessage(modelMessage)
+
+ // Currently, modelMessageToUIMessage only extracts text content
+ expect(result.parts).toEqual([{ type: 'text', content: 'What is this?' }])
+ })
+
+ it('should handle tool message', () => {
+ const modelMessage: ModelMessage = {
+ role: 'tool',
+ content: '{"result": "success"}',
+ toolCallId: 'tool-1',
+ }
+
+ const result = modelMessageToUIMessage(modelMessage)
+
+ expect(result.role).toBe('assistant') // Tool messages become assistant
+ expect(result.parts).toContainEqual({
+ type: 'tool-result',
+ toolCallId: 'tool-1',
+ content: '{"result": "success"}',
+ state: 'complete',
+ })
+ })
+ })
+})
diff --git a/packages/typescript/smoke-tests/adapters/fixtures/jpgfixture.jpg b/packages/typescript/smoke-tests/adapters/fixtures/jpgfixture.jpg
new file mode 100644
index 000000000..0a24df99d
Binary files /dev/null and b/packages/typescript/smoke-tests/adapters/fixtures/jpgfixture.jpg differ
diff --git a/packages/typescript/smoke-tests/adapters/fixtures/pngfixture.png b/packages/typescript/smoke-tests/adapters/fixtures/pngfixture.png
new file mode 100644
index 000000000..b82a6a355
Binary files /dev/null and b/packages/typescript/smoke-tests/adapters/fixtures/pngfixture.png differ
diff --git a/packages/typescript/smoke-tests/adapters/src/adapters/index.ts b/packages/typescript/smoke-tests/adapters/src/adapters/index.ts
index 532b3ed0e..25fb6a8a3 100644
--- a/packages/typescript/smoke-tests/adapters/src/adapters/index.ts
+++ b/packages/typescript/smoke-tests/adapters/src/adapters/index.ts
@@ -83,7 +83,7 @@ const GEMINI_TTS_MODEL =
const OLLAMA_MODEL = process.env.OLLAMA_MODEL || 'mistral:7b'
const OLLAMA_SUMMARY_MODEL = process.env.OLLAMA_SUMMARY_MODEL || OLLAMA_MODEL
-const GROK_MODEL = process.env.GROK_MODEL || 'grok-3'
+const GROK_MODEL = process.env.GROK_MODEL || 'grok-4'
const GROK_SUMMARY_MODEL = process.env.GROK_SUMMARY_MODEL || GROK_MODEL
const GROK_IMAGE_MODEL = process.env.GROK_IMAGE_MODEL || 'grok-2-image-1212'
diff --git a/packages/typescript/smoke-tests/adapters/src/tests/index.ts b/packages/typescript/smoke-tests/adapters/src/tests/index.ts
index e3b24cb60..8c4f8bee9 100644
--- a/packages/typescript/smoke-tests/adapters/src/tests/index.ts
+++ b/packages/typescript/smoke-tests/adapters/src/tests/index.ts
@@ -1,5 +1,3 @@
-import type { AdapterContext, TestOutcome } from '../harness'
-
// Import all test runners
import { runCST } from './cst-chat-stream'
import { runOST } from './ost-one-shot-text'
@@ -12,6 +10,9 @@ import { runSMS } from './sms-summarize-stream'
import { runIMG } from './img-image-generation'
import { runTTS } from './tts-text-to-speech'
import { runTRN } from './trn-transcription'
+import { runMMJ, runMMP } from './mmi-multimodal-image'
+import { runMMS, runMMT } from './mms-multimodal-structured'
+import type { AdapterContext, TestOutcome } from '../harness'
/**
* Adapter capability types
@@ -36,7 +37,7 @@ export interface TestDefinition {
/** Function to run the test */
run: (ctx: AdapterContext) => Promise<TestOutcome>
/** Required adapter capabilities (defaults to ['text']) */
- requires: AdapterCapability[]
+ requires: Array<AdapterCapability>
/** If true, test is skipped unless explicitly requested */
skipByDefault?: boolean
}
@@ -44,7 +45,7 @@ export interface TestDefinition {
/**
* Registry of all available tests
*/
-export const TESTS: TestDefinition[] = [
+export const TESTS: Array<TestDefinition> = [
{
id: 'CST',
name: 'Chat Stream',
@@ -125,6 +126,35 @@ export const TESTS: TestDefinition[] = [
requires: ['transcription'],
skipByDefault: true, // Skip unless explicitly requested
},
+ {
+ id: 'MMJ',
+ name: 'Multimodal JPEG',
+ description:
+ 'Describe a JPEG image (meme with man, React icon, code/email text)',
+ run: runMMJ,
+ requires: ['text'],
+ },
+ {
+ id: 'MMP',
+ name: 'Multimodal PNG',
+ description: 'Describe a PNG image (beach scene with AG UI text)',
+ run: runMMP,
+ requires: ['text'],
+ },
+ {
+ id: 'MMS',
+ name: 'Multimodal Structured JPEG',
+ description: 'Describe a JPEG image with structured JSON output',
+ run: runMMS,
+ requires: ['text'],
+ },
+ {
+ id: 'MMT',
+ name: 'Multimodal Structured PNG',
+ description: 'Describe a PNG image with structured JSON output',
+ run: runMMT,
+ requires: ['text'],
+ },
]
/**
@@ -137,13 +167,13 @@ export function getTest(id: string): TestDefinition | undefined {
/**
* Get all test IDs
*/
-export function getTestIds(): string[] {
+export function getTestIds(): Array<string> {
return TESTS.map((t) => t.id)
}
/**
* Get tests that run by default (excluding skipByDefault tests)
*/
-export function getDefaultTests(): TestDefinition[] {
+export function getDefaultTests(): Array<TestDefinition> {
return TESTS.filter((t) => !t.skipByDefault)
}
diff --git a/packages/typescript/smoke-tests/adapters/src/tests/mmi-multimodal-image.ts b/packages/typescript/smoke-tests/adapters/src/tests/mmi-multimodal-image.ts
new file mode 100644
index 000000000..15bdcac92
--- /dev/null
+++ b/packages/typescript/smoke-tests/adapters/src/tests/mmi-multimodal-image.ts
@@ -0,0 +1,193 @@
+import { readFile } from 'node:fs/promises'
+import { join } from 'node:path'
+import { runTestCase } from '../harness'
+import type { AdapterContext, TestOutcome } from '../harness'
+import type { ContentPart } from '@tanstack/ai'
+
+/**
+ * Detect image mime type from file extension
+ */
+function getMimeType(filename: string): string {
+ const ext = filename.toLowerCase().split('.').pop()
+ switch (ext) {
+ case 'jpg':
+ case 'jpeg':
+ return 'image/jpeg'
+ case 'png':
+ return 'image/png'
+ case 'gif':
+ return 'image/gif'
+ case 'webp':
+ return 'image/webp'
+ default:
+ return 'image/jpeg'
+ }
+}
+
+/**
+ * MMJ: Multimodal Image JPEG Test
+ *
+ * Tests multimodal image support by sending a JPEG image
+ * and asking the model to describe it.
+ * The image shows a man pointing towards a React icon with text
+ * "MY CODE" and "IS THIS AN EMAIL?" (meme format).
+ */
+export async function runMMJ(
+ adapterContext: AdapterContext,
+): Promise<TestOutcome> {
+ const testName = 'mmj-multimodal-jpeg'
+ const adapterName = adapterContext.adapterName
+ const fixtureFile = 'jpgfixture.jpg'
+ const fixturePath = join(process.cwd(), 'fixtures', fixtureFile)
+
+ // Try to load the image file
+ let imageBase64: string
+ try {
+ const imageBuffer = await readFile(fixturePath)
+ imageBase64 = imageBuffer.toString('base64')
+ } catch {
+ console.log(
+ `[${adapterName}] — ${testName}: Ignored (no fixture file at fixtures/${fixtureFile})`,
+ )
+ return { passed: true, ignored: true }
+ }
+
+ const mimeType = getMimeType(fixtureFile)
+
+ // Build multimodal content
+ const contentParts: Array<ContentPart> = [
+ { type: 'text', content: 'Describe this image' },
+ {
+ type: 'image',
+ source: { type: 'data', value: imageBase64, mimeType },
+ },
+ ]
+
+ return runTestCase({
+ adapterContext,
+ testName,
+ description:
'JPEG image description mentions at least one of: man/person, React icon, or meme text',
+ messages: [{ role: 'user' as const, content: contentParts }],
+ validate: (run) => {
+ const response = run.fullResponse.toLowerCase()
+
+ // Check for person/man/character
+ const hasPerson =
+ response.includes('man') ||
+ response.includes('person') ||
+ response.includes('guy') ||
+ response.includes('someone') ||
+ response.includes('character') ||
+ response.includes('hand') ||
+ response.includes('figure')
+
+ // Check for React icon/logo
+ const hasReact =
+ response.includes('react') ||
+ response.includes('logo') ||
+ response.includes('icon') ||
+ response.includes('atom')
+
+ // Check for meme text content
+ const hasCodeText =
+ response.includes('code') || response.includes('my code')
+ const hasEmailText =
+ response.includes('email') || response.includes('is this an email')
+
+ const passed =
+ hasPerson ||
+ hasReact ||
+ hasCodeText ||
+ hasEmailText ||
+ response.includes('image')
+
+ return {
+ passed,
+ error: passed
+ ? undefined
: `Response mentioned none of the expected elements (person, React icon, meme text, or the word "image"). hasPerson=${hasPerson}, hasReact=${hasReact}, hasCodeText=${hasCodeText}, hasEmailText=${hasEmailText}`,
+ meta: {
+ hasPerson,
+ hasReact,
+ hasCodeText,
+ hasEmailText,
+ responseLength: response.length,
+ },
+ }
+ },
+ })
+}
+
+/**
+ * MMP: Multimodal Image PNG Test
+ *
+ * Tests multimodal image support by sending a PNG image
+ * and asking the model to describe it.
+ * The image shows a beach scene with "AG UI READY" text.
+ * Expects the response to mention at least one of: beach, sea/ocean, or AG UI text.
+ */
+export async function runMMP(
+ adapterContext: AdapterContext,
+): Promise<TestOutcome> {
+ const testName = 'mmp-multimodal-png'
+ const adapterName = adapterContext.adapterName
+ const fixtureFile = 'pngfixture.png'
+ const fixturePath = join(process.cwd(), 'fixtures', fixtureFile)
+
+ // Try to load the image file
+ let imageBase64: string
+ try {
+ const imageBuffer = await readFile(fixturePath)
+ imageBase64 = imageBuffer.toString('base64')
+ } catch {
+ console.log(
+ `[${adapterName}] — ${testName}: Ignored (no fixture file at fixtures/${fixtureFile})`,
+ )
+ return { passed: true, ignored: true }
+ }
+
+ const mimeType = getMimeType(fixtureFile)
+
+ // Build multimodal content
+ const contentParts: Array<ContentPart> = [
+ { type: 'text', content: 'Describe this image' },
+ {
+ type: 'image',
+ source: { type: 'data', value: imageBase64, mimeType },
+ },
+ ]
+
+ return runTestCase({
+ adapterContext,
+ testName,
+ description:
+ 'PNG image description mentions beach, sea, or AG UI text (at least one)',
+ messages: [{ role: 'user' as const, content: contentParts }],
+ validate: (run) => {
+ const response = run.fullResponse.toLowerCase()
+
+ const hasBeach = response.includes('beach')
+ const hasSea =
+ response.includes('sea') ||
+ response.includes('ocean') ||
+ response.includes('water')
+ const hasAgUi =
+ response.includes('ag ui') ||
+ response.includes('ag-ui') ||
+ response.includes('agui') ||
+ response.includes('ready')
+
+ // Pass if at least one of the expected elements is mentioned
+ const passed = hasBeach || hasSea || hasAgUi || response.includes('image')
+
+ return {
+ passed,
+ error: passed
+ ? undefined
+ : `Response missing expected content. Need at least one of: hasBeach=${hasBeach}, hasSea=${hasSea}, hasAgUi=${hasAgUi}, or mentions "image"`,
+ meta: { hasBeach, hasSea, hasAgUi, responseLength: response.length },
+ }
+ },
+ })
+}
diff --git a/packages/typescript/smoke-tests/adapters/src/tests/mms-multimodal-structured.ts b/packages/typescript/smoke-tests/adapters/src/tests/mms-multimodal-structured.ts
new file mode 100644
index 000000000..12bc9a0b5
--- /dev/null
+++ b/packages/typescript/smoke-tests/adapters/src/tests/mms-multimodal-structured.ts
@@ -0,0 +1,248 @@
+import { readFile } from 'node:fs/promises'
+import { join } from 'node:path'
+import { runTestCase } from '../harness'
+import type { AdapterContext, TestOutcome } from '../harness'
+import type { ContentPart } from '@tanstack/ai'
+
+/**
+ * Detect image mime type from file extension
+ */
+function getMimeType(filename: string): string {
+ const ext = filename.toLowerCase().split('.').pop()
+ switch (ext) {
+ case 'jpg':
+ case 'jpeg':
+ return 'image/jpeg'
+ case 'png':
+ return 'image/png'
+ case 'gif':
+ return 'image/gif'
+ case 'webp':
+ return 'image/webp'
+ default:
+ return 'image/jpeg'
+ }
+}
+
+/**
+ * JSON Schema prompt for structured image description
+ */
+const STRUCTURED_PROMPT = `Analyze this image and provide a structured description. Return ONLY valid JSON (no markdown code blocks) matching this schema:
+{
+ "description": "A brief description of what the image shows",
+ "hasText": true/false,
+ "textContent": "The text content visible in the image, if any",
+ "mainSubject": "The main subject or focal point of the image",
+ "colors": ["array", "of", "primary", "colors"]
+}`
+
+interface ImageDescription {
+ description: string
+ hasText: boolean
+ textContent?: string
+ mainSubject: string
+ colors: Array<string>
+}
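+
+// For reference, a response matching the schema might look like this
+// (illustrative values for the JPEG fixture):
+// {
+//   "description": "A meme of a man pointing at a React logo",
+//   "hasText": true,
+//   "textContent": "MY CODE / IS THIS AN EMAIL?",
+//   "mainSubject": "Man pointing at a React icon",
+//   "colors": ["blue", "white", "black"]
+// }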
+
+/**
+ * MMS: Multimodal Structured JPEG Test
+ *
+ * Tests multimodal image support with structured output by sending a JPEG image
+ * and asking the model to describe it using a JSON schema.
+ * The image shows a man pointing towards a React icon with text
+ * "MY CODE" and "IS THIS AN EMAIL?" (meme format).
+ */
+export async function runMMS(
+ adapterContext: AdapterContext,
+): Promise<TestOutcome> {
+ const testName = 'mms-multimodal-structured-jpeg'
+ const adapterName = adapterContext.adapterName
+ const fixtureFile = 'jpgfixture.jpg'
+ const fixturePath = join(process.cwd(), 'fixtures', fixtureFile)
+
+ // Try to load the image file
+ let imageBase64: string
+ try {
+ const imageBuffer = await readFile(fixturePath)
+ imageBase64 = imageBuffer.toString('base64')
+ } catch {
+ console.log(
+ `[${adapterName}] — ${testName}: Ignored (no fixture file at fixtures/${fixtureFile})`,
+ )
+ return { passed: true, ignored: true }
+ }
+
+ const mimeType = getMimeType(fixtureFile)
+
+ // Build multimodal content with structured output request
+ const contentParts: Array<ContentPart> = [
+ {
+ type: 'text',
+ content: STRUCTURED_PROMPT,
+ },
+ {
+ type: 'image',
+ source: { type: 'data', value: imageBase64, mimeType },
+ },
+ ]
+
+ return runTestCase({
+ adapterContext,
+ testName,
+ description:
+ 'JPEG image with structured output returns valid JSON with description, hasText, mainSubject, colors',
+ messages: [{ role: 'user' as const, content: contentParts }],
+ validate: (run) => {
+ const response = run.fullResponse
+
+ // Try to parse as JSON
+ let parsed: ImageDescription | null = null
+ try {
+ // Try to extract JSON from response (might be wrapped in markdown code blocks)
+ const jsonMatch = response.match(/```(?:json)?\s*([\s\S]*?)```/)
+ const jsonStr =
+ jsonMatch && jsonMatch[1] ? jsonMatch[1].trim() : response.trim()
+ parsed = JSON.parse(jsonStr)
+ } catch {
+ // If direct parse fails, try the raw response
+ try {
+ parsed = JSON.parse(response)
+ } catch {
+ return {
+ passed: false,
+ error: `Failed to parse response as JSON: ${response.substring(0, 200)}`,
+ meta: { responseLength: response.length },
+ }
+ }
+ }
+
+ // Validate structure
+ const hasDescription =
+ typeof parsed?.description === 'string' && parsed.description.length > 0
+ const hasMainSubject =
+ typeof parsed?.mainSubject === 'string' && parsed.mainSubject.length > 0
+ const hasColors =
+ Array.isArray(parsed?.colors) && parsed.colors.length > 0
+ const hasTextBoolean = typeof parsed?.hasText === 'boolean'
+
+ const passed =
+ hasDescription && hasMainSubject && hasColors && hasTextBoolean
+
+ return {
+ passed,
+ error: passed
+ ? undefined
+ : `Structured output missing required fields. hasDescription=${hasDescription}, hasMainSubject=${hasMainSubject}, hasColors=${hasColors}, hasTextBoolean=${hasTextBoolean}`,
+ meta: {
+ hasDescription,
+ hasMainSubject,
+ hasColors,
+ hasTextBoolean,
+ parsed,
+ responseLength: response.length,
+ },
+ }
+ },
+ })
+}
+
+/**
+ * MMT: Multimodal Structured PNG Test
+ *
+ * Tests multimodal image support with structured output by sending a PNG image
+ * and asking the model to describe it using a JSON schema.
+ * The image shows a beach scene with "AG UI READY" text.
+ */
+export async function runMMT(
+ adapterContext: AdapterContext,
+): Promise<TestOutcome> {
+ const testName = 'mmt-multimodal-structured-png'
+ const adapterName = adapterContext.adapterName
+ const fixtureFile = 'pngfixture.png'
+ const fixturePath = join(process.cwd(), 'fixtures', fixtureFile)
+
+ // Try to load the image file
+ let imageBase64: string
+ try {
+ const imageBuffer = await readFile(fixturePath)
+ imageBase64 = imageBuffer.toString('base64')
+ } catch {
+ console.log(
+ `[${adapterName}] — ${testName}: Ignored (no fixture file at fixtures/${fixtureFile})`,
+ )
+ return { passed: true, ignored: true }
+ }
+
+ const mimeType = getMimeType(fixtureFile)
+
+ // Build multimodal content with structured output request
+ const contentParts: Array<ContentPart> = [
+ {
+ type: 'text',
+ content: STRUCTURED_PROMPT,
+ },
+ {
+ type: 'image',
+ source: { type: 'data', value: imageBase64, mimeType },
+ },
+ ]
+
+ return runTestCase({
+ adapterContext,
+ testName,
+ description:
+ 'PNG image with structured output returns valid JSON with description, hasText, mainSubject, colors',
+ messages: [{ role: 'user' as const, content: contentParts }],
+ validate: (run) => {
+ const response = run.fullResponse
+
+ // Try to parse as JSON
+ let parsed: ImageDescription | null = null
+ try {
+ // Try to extract JSON from response (might be wrapped in markdown code blocks)
+ const jsonMatch = response.match(/```(?:json)?\s*([\s\S]*?)```/)
+ const jsonStr =
+ jsonMatch && jsonMatch[1] ? jsonMatch[1].trim() : response.trim()
+ parsed = JSON.parse(jsonStr)
+ } catch {
+ // If direct parse fails, try the raw response
+ try {
+ parsed = JSON.parse(response)
+ } catch {
+ return {
+ passed: false,
+ error: `Failed to parse response as JSON: ${response.substring(0, 200)}`,
+ meta: { responseLength: response.length },
+ }
+ }
+ }
+
+ // Validate structure
+ const hasDescription =
+ typeof parsed?.description === 'string' && parsed.description.length > 0
+ const hasMainSubject =
+ typeof parsed?.mainSubject === 'string' && parsed.mainSubject.length > 0
+ const hasColors =
+ Array.isArray(parsed?.colors) && parsed.colors.length > 0
+ const hasTextBoolean = typeof parsed?.hasText === 'boolean'
+
+ const passed =
+ hasDescription && hasMainSubject && hasColors && hasTextBoolean
+
+ return {
+ passed,
+ error: passed
+ ? undefined
+ : `Structured output missing required fields. hasDescription=${hasDescription}, hasMainSubject=${hasMainSubject}, hasColors=${hasColors}, hasTextBoolean=${hasTextBoolean}`,
+ meta: {
+ hasDescription,
+ hasMainSubject,
+ hasColors,
+ hasTextBoolean,
+ parsed,
+ responseLength: response.length,
+ },
+ }
+ },
+ })
+}