Multimodal API
The DeepSeek Multimodal API enables you to work with both text and images in a single conversation, allowing for rich, context-aware interactions.
Overview
Our multimodal capabilities allow you to:
- Analyze and describe images
- Answer questions about visual content
- Generate text based on image inputs
- Combine text and image understanding in conversations
Supported Formats
Image Formats
- JPEG (.jpg, .jpeg)
- PNG (.png)
- GIF (.gif)
- WebP (.webp)
- BMP (.bmp)
Image Constraints
- Maximum file size: 20MB
- Maximum dimensions: 4096 x 4096 pixels
- Minimum dimensions: 32 x 32 pixels
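The API enforces these limits server-side, but you may want to catch violations locally before uploading. Below is a minimal preflight check using Pillow; the helper and constant names are our own, not part of the SDK.
python
import os
from PIL import Image

MAX_FILE_BYTES = 20 * 1024 * 1024   # 20MB limit from the constraints above
MAX_DIM = 4096
MIN_DIM = 32
ALLOWED_FORMATS = {"JPEG", "PNG", "GIF", "WEBP", "BMP"}

def check_image_constraints(image_path):
    """Raise ValueError if the image violates the documented constraints."""
    if os.path.getsize(image_path) > MAX_FILE_BYTES:
        raise ValueError("Image exceeds the 20MB size limit")
    with Image.open(image_path) as img:
        if img.format not in ALLOWED_FORMATS:
            raise ValueError(f"Unsupported format: {img.format}")
        width, height = img.size
        if width > MAX_DIM or height > MAX_DIM:
            raise ValueError("Image exceeds 4096 x 4096 pixels")
        if width < MIN_DIM or height < MIN_DIM:
            raise ValueError("Image is smaller than 32 x 32 pixels")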
Basic Usage
Image URL Input
python
from openai import OpenAI

client = OpenAI(
    api_key="YOUR_API_KEY",
    base_url="https://api.deepseek.com/v1"
)

response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What do you see in this image?"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://example.com/image.jpg"
                    }
                }
            ]
        }
    ],
    max_tokens=500
)

print(response.choices[0].message.content)
Base64 Image Input
python
import base64

def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

# Encode the image
base64_image = encode_image("path/to/your/image.jpg")

response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Describe this image in detail."
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}"
                    }
                }
            ]
        }
    ],
    max_tokens=800
)
Advanced Features
Multiple Images
You can include multiple images in a single request:
python
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Compare these two images and tell me the differences."
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/image1.jpg"}
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/image2.jpg"}
                }
            ]
        }
    ],
    max_tokens=1000
)
Image Detail Control
Control the level of detail in image processing:
python
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Analyze this image in high detail."
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://example.com/image.jpg",
                        "detail": "high"  # Options: "low", "high", "auto"
                    }
                }
            ]
        }
    ]
)
Detail Levels
- low: Faster processing, basic image understanding
- high: Detailed analysis, better for complex images
- auto: Automatically chooses based on image complexity (default)
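If you want to pick a detail level programmatically, a simple heuristic (our own illustration, not an API rule) is to reserve "high" for text-heavy or fine-grained images and "low" for small thumbnails:
python
# A minimal heuristic for choosing a detail level; the thresholds are illustrative.
def choose_detail(width, height, needs_fine_text=False):
    if needs_fine_text:
        return "high"   # OCR, charts, dense documents
    if max(width, height) <= 512:
        return "low"    # small images gain little from high detail
    return "auto"       # let the API decide for everything else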
Use Cases
Image Description
Generate detailed descriptions of images:
python
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Provide a detailed description of this image, including objects, people, setting, and mood."
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/photo.jpg"}
                }
            ]
        }
    ]
)
Visual Question Answering
Ask specific questions about image content:
python
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "How many people are in this image? What are they doing?"
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/group_photo.jpg"}
                }
            ]
        }
    ]
)
OCR and Text Extraction
Extract and read text from images:
python
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Extract all the text from this image and format it nicely."
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/document.jpg"}
                }
            ]
        }
    ]
)
Chart and Graph Analysis
Analyze data visualizations:
python
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Analyze this chart. What trends do you see? Provide insights and key takeaways."
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/sales_chart.png"}
                }
            ]
        }
    ]
)
Product Analysis
Analyze products in images:
python
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Analyze this product image. Describe the product, its features, and suggest improvements."
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/product.jpg"}
                }
            ]
        }
    ]
)
Conversation Context
Multi-turn Conversations
Maintain context across multiple exchanges:
python
conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What do you see in this image?"
            },
            {
                "type": "image_url",
                "image_url": {"url": "https://example.com/kitchen.jpg"}
            }
        ]
    },
    {
        "role": "assistant",
        "content": "I see a modern kitchen with stainless steel appliances, granite countertops, and white cabinets..."
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What cooking equipment do you see that would be good for baking?"
            }
        ]
    }
]

response = client.chat.completions.create(
    model="deepseek-chat",
    messages=conversation
)
Image Reference in Follow-up
Reference previously shown images:
python
# First message with image
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Describe this room."
            },
            {
                "type": "image_url",
                "image_url": {"url": "https://example.com/room.jpg"}
            }
        ]
    }
]

# Get initial response
response1 = client.chat.completions.create(
    model="deepseek-chat",
    messages=messages
)

# Add assistant response to conversation
messages.append({
    "role": "assistant",
    "content": response1.choices[0].message.content
})

# Follow-up question about the same image
messages.append({
    "role": "user",
    "content": [
        {
            "type": "text",
            "text": "What changes would you suggest to make this room more modern?"
        }
    ]
})

response2 = client.chat.completions.create(
    model="deepseek-chat",
    messages=messages
)
Best Practices
Image Quality
- Use high-quality images: Better quality leads to better analysis
- Ensure good lighting: Well-lit images are easier to analyze
- Avoid blurry images: Sharp, clear images work best
- Consider composition: Well-framed subjects are easier to identify
Prompt Engineering
- Be specific: Ask clear, specific questions about the image
- Provide context: Give background information when helpful
- Use examples: Show the format you want for responses
- Break down complex tasks: Split complex analysis into steps
python
# Good prompt example
prompt = """
Analyze this product image and provide:
1. Product name and category
2. Key features visible in the image
3. Target audience
4. Pricing tier estimate (budget/mid-range/premium)
5. Marketing suggestions based on visual appeal
"""
Performance Optimization
- Optimize image size: Resize large images to reduce processing time
- Use appropriate detail level: Choose "low" for simple tasks
- Batch similar requests: Group related image analysis tasks
- Cache results: Store analysis results for repeated queries
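As a sketch of the caching point above, the helper below stores results keyed on the image bytes and the prompt. The cache layer and function names are our own additions, not an API feature; analyze_fn stands in for any of the analysis calls shown on this page.
python
import hashlib
import json
import os

CACHE_DIR = ".image_analysis_cache"

def cached_analysis(image_path, prompt, analyze_fn):
    """Return a cached result for repeated image/prompt pairs.

    analyze_fn is any callable taking (image_path, prompt) and returning a string,
    e.g. a wrapper around the chat completions call shown earlier.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
    with open(image_path, "rb") as f:
        key = hashlib.sha256(f.read() + prompt.encode()).hexdigest()
    cache_file = os.path.join(CACHE_DIR, f"{key}.json")
    if os.path.exists(cache_file):
        with open(cache_file) as f:
            return json.load(f)["analysis"]
    analysis = analyze_fn(image_path, prompt)
    with open(cache_file, "w") as f:
        json.dump({"analysis": analysis}, f)
    return analysis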
Error Handling
python
import base64
from PIL import Image
import io

def safe_image_analysis(image_path, prompt):
    try:
        # Validate image
        with Image.open(image_path) as img:
            if img.size[0] > 4096 or img.size[1] > 4096:
                # Resize if too large
                img.thumbnail((4096, 4096), Image.Resampling.LANCZOS)
                # Convert to base64 (drop any alpha channel so JPEG encoding succeeds)
                buffer = io.BytesIO()
                img.convert("RGB").save(buffer, format='JPEG')
                base64_image = base64.b64encode(buffer.getvalue()).decode()
            else:
                # Use original image
                base64_image = encode_image(image_path)

        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ]
        )
        return {"success": True, "result": response.choices[0].message.content}
    except Exception as e:
        return {"success": False, "error": str(e)}
Limitations
Current Limitations
- No image generation: The API analyzes images but doesn't create them
- Static images only: No support for video or animated content
- No image editing: Cannot modify or enhance images
- Context window: Images consume significant tokens
Content Restrictions
- No analysis of inappropriate or harmful content
- Limited support for very low-quality or corrupted images
- Cannot identify specific individuals (privacy protection)
Pricing
Token Calculation
Images are converted to tokens based on:
- Image size and resolution
- Detail level selected
- Processing complexity
Cost Estimation
python
def estimate_image_tokens(width, height, detail="auto"):
    """Estimate token usage for an image"""
    if detail == "low":
        return 85  # Fixed cost for low detail

    # High detail calculation
    if width > 2048 or height > 2048:
        # Resize to fit within 2048x2048
        scale = min(2048 / width, 2048 / height)
        width = int(width * scale)
        height = int(height * scale)

    # Calculate tiles (512x512 each)
    tiles_width = (width + 511) // 512
    tiles_height = (height + 511) // 512
    total_tiles = tiles_width * tiles_height

    return 85 + (total_tiles * 170)  # Base cost + tile costs
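As a rough worked example with this estimator (actual billing may differ; check response.usage on real requests):
python
# 1024 x 768 at high detail fits within 2048 x 2048, giving 2 x 2 = 4 tiles.
print(estimate_image_tokens(1024, 768, detail="high"))   # 85 + 4 * 170 = 765
print(estimate_image_tokens(1024, 768, detail="low"))    # 85 (fixed)
# 4000 x 3000 is first scaled down to 2048 x 1536, giving 4 x 3 = 12 tiles.
print(estimate_image_tokens(4000, 3000, detail="high"))  # 85 + 12 * 170 = 2125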
Code Examples
Complete Image Analysis Application
python
import base64
import json
from openai import OpenAI
from PIL import Image
import io

class ImageAnalyzer:
    def __init__(self, api_key):
        self.client = OpenAI(
            api_key=api_key,
            base_url="https://api.deepseek.com/v1"
        )

    def encode_image(self, image_path):
        """Encode image to base64"""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def analyze_image(self, image_path, prompt, detail="auto"):
        """Analyze an image with a custom prompt"""
        try:
            base64_image = self.encode_image(image_path)
            response = self.client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}",
                                    "detail": detail
                                }
                            }
                        ]
                    }
                ],
                max_tokens=1000
            )
            return {
                "success": True,
                "analysis": response.choices[0].message.content,
                "usage": response.usage
            }
        except Exception as e:
            return {"success": False, "error": str(e)}

    def extract_text(self, image_path):
        """Extract text from an image (OCR)"""
        prompt = "Extract all text from this image. Format it clearly and maintain the original structure."
        return self.analyze_image(image_path, prompt)

    def describe_image(self, image_path):
        """Generate a detailed description of the image"""
        prompt = "Provide a detailed description of this image, including objects, people, setting, colors, and overall composition."
        return self.analyze_image(image_path, prompt)

    def analyze_chart(self, image_path):
        """Analyze charts and graphs"""
        prompt = "Analyze this chart or graph. Describe the data, trends, key insights, and provide a summary of findings."
        return self.analyze_image(image_path, prompt, detail="high")

# Usage example
analyzer = ImageAnalyzer("YOUR_API_KEY")

# Analyze an image
result = analyzer.describe_image("path/to/image.jpg")
if result["success"]:
    print(result["analysis"])
else:
    print(f"Error: {result['error']}")
Batch Image Processing
python
import os
import json
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_images_batch(image_folder, output_file):
    """Process multiple images in parallel"""
    analyzer = ImageAnalyzer("YOUR_API_KEY")
    results = {}

    image_files = [f for f in os.listdir(image_folder)
                   if f.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp'))]

    def process_single_image(filename):
        image_path = os.path.join(image_folder, filename)
        result = analyzer.describe_image(image_path)
        return filename, result

    # Process images in parallel
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_filename = {
            executor.submit(process_single_image, filename): filename
            for filename in image_files
        }
        for future in as_completed(future_to_filename):
            filename, result = future.result()
            results[filename] = result
            print(f"Processed: {filename}")

    # Save results (default=str handles non-JSON-serializable fields such as the usage object)
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2, default=str)

    return results

# Usage
results = process_images_batch("./images", "analysis_results.json")