Multimodal Guide
Learn how to work with images and text using DeepSeek's multimodal capabilities for visual understanding and analysis.
Overview
DeepSeek's multimodal API allows you to:
- Analyze images: Describe, understand, and extract information from images
- Visual Q&A: Ask questions about image content
- OCR capabilities: Extract text from images
- Chart analysis: Understand graphs, charts, and diagrams
- Multi-image processing: Compare and analyze multiple images
- Combined workflows: Integrate text and visual processing
Getting Started
Basic Image Analysis
python
from openai import OpenAI
import os

# Shared client used by every example below. The key is read from the
# DEEPSEEK_API_KEY environment variable so real credentials never have to be
# written into source; the placeholder only remains as a fallback.
client = OpenAI(
    api_key=os.environ.get("DEEPSEEK_API_KEY", "sk-your-deepseek-key"),
    base_url="https://api.deepseek.com/v1",
)
# Analyze an image from URL
# The user turn is a list of content parts: the question first, then the image.
question_part = {"type": "text", "text": "What do you see in this image?"}
image_part = {
    "type": "image_url",
    "image_url": {"url": "https://example.com/image.jpg"},
}
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[{"role": "user", "content": [question_part, image_part]}],
)
# Print the text of the first (and only requested) choice.
print(response.choices[0].message.content)
Base64 Image Input
python
import base64
from pathlib import Path
def encode_image(image_path: str) -> str:
    """Return the bytes of the file at ``image_path`` as a base64 string.

    The file is read in binary mode and the base64 output is decoded as
    UTF-8 text so it can be embedded directly in a ``data:`` URL.
    """
    raw_bytes = Path(image_path).read_bytes()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
# Load and encode image
image_path = "path/to/your/image.jpg"
base64_image = encode_image(image_path)

# Send the encoded bytes inline as a data URL instead of a remote link.
question_part = {"type": "text", "text": "Describe this image in detail."}
image_part = {
    "type": "image_url",
    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
}
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[{"role": "user", "content": [question_part, image_part]}],
)
print(response.choices[0].message.content)
Image Input Methods
Supported Formats
python
# MIME types accepted by the examples, mapped to the file extensions
# (lower-case, with leading dot) that are treated as that type.
SUPPORTED_FORMATS = {
    "image/jpeg": [".jpg", ".jpeg"],
    "image/png": [".png"],
    "image/gif": [".gif"],
    "image/webp": [".webp"],
}


def validate_image_format(file_path: str) -> bool:
    """Return True when the path's extension appears in SUPPORTED_FORMATS.

    Only the extension is inspected, case-insensitively. The file is never
    opened, so a True result does not prove the content is a valid image.

    Args:
        file_path: Path or file name to check.

    Returns:
        True if the extension is listed for any supported MIME type.
    """
    file_extension = Path(file_path).suffix.lower()
    return any(
        file_extension in extensions
        for extensions in SUPPORTED_FORMATS.values()
    )
# Usage
# Report whether the example file's extension is one of the supported ones.
image_path = "example.jpg"
format_message = (
    "✅ Image format supported"
    if validate_image_format(image_path)
    else "❌ Image format not supported"
)
print(format_message)
Image Size Limits
python
from PIL import Image
import os
def check_image_constraints(
    image_path: str, max_file_size: int = 20 * 1024 * 1024
) -> dict:
    """Check if image meets API constraints.

    Args:
        image_path: Path of the image file to inspect.
        max_file_size: Largest accepted file size in bytes. Defaults to the
            20 MB example limit used throughout this guide.

    Returns:
        A dict with these keys:
        - ``file_size_ok``: True when the file is not larger than the limit
        - ``file_size_mb``: file size in MB as a float
        - ``dimensions``: ``"<width>x<height>"`` as a string
        - ``format``: the format reported by PIL
        - ``mode``: the colour mode reported by PIL
    """
    file_size = os.path.getsize(image_path)

    # Read every PIL attribute while the file is still open.
    with Image.open(image_path) as img:
        width, height = img.size
        image_format = img.format
        image_mode = img.mode

    return {
        "file_size_ok": file_size <= max_file_size,
        "file_size_mb": file_size / (1024 * 1024),
        "dimensions": f"{width}x{height}",
        "format": image_format,
        "mode": image_mode,
    }
# Usage
# Inspect the example file, then print the size check, the size in MB
# (two decimals) and the "<width>x<height>" dimensions string.
constraints = check_image_constraints("example.jpg")
print(f"File size OK: {constraints['file_size_ok']}")
print(f"Size: {constraints['file_size_mb']:.2f} MB")
print(f"Dimensions: {constraints['dimensions']}")
Image Preprocessing
python
from PIL import Image
import io
import base64
class ImageProcessor:
    """Process images for optimal API usage.

    Every public method returns a base64 string of JPEG data, so callers can
    always label the payload ``data:image/jpeg;base64,...``.
    """

    @staticmethod
    def _jpeg_base64(img, quality: int, optimize: bool = False) -> str:
        """Serialize an RGB PIL image to in-memory JPEG and base64-encode it."""
        buffer = io.BytesIO()
        img.save(buffer, format='JPEG', quality=quality, optimize=optimize)
        return base64.b64encode(buffer.getvalue()).decode('utf-8')

    @staticmethod
    def resize_image(
        image_path: str, max_size: tuple = (1024, 1024), quality: int = 85
    ) -> str:
        """Resize image while maintaining aspect ratio.

        Args:
            image_path: Path of the image file to load.
            max_size: ``(width, height)`` bounding box passed to
                ``Image.thumbnail``.
            quality: JPEG quality used when re-encoding (default 85, the
                value that used to be hard-coded here).

        Returns:
            Base64 string of the resized image encoded as JPEG.
        """
        with Image.open(image_path) as img:
            # JPEG output needs RGB; convert before resampling.
            if img.mode != 'RGB':
                img = img.convert('RGB')
            img.thumbnail(max_size, Image.Resampling.LANCZOS)
            return ImageProcessor._jpeg_base64(img, quality)

    @staticmethod
    def compress_image(image_path: str, quality: int = 85) -> str:
        """Compress image to reduce file size.

        Re-encodes the image as an optimized JPEG at the given quality
        without changing its dimensions.

        Returns:
            Base64 string of the re-encoded JPEG.
        """
        with Image.open(image_path) as img:
            if img.mode != 'RGB':
                img = img.convert('RGB')
            return ImageProcessor._jpeg_base64(img, quality, optimize=True)

    @staticmethod
    def prepare_image_for_api(
        image_path: str, max_size: tuple = (1024, 1024), quality: int = 85
    ) -> str:
        """Prepare image for API with optimal settings.

        Decision order:
        1. files larger than 10 MB are compressed;
        2. images exceeding ``max_size`` in either dimension are resized;
        3. non-JPEG files are re-encoded as JPEG;
        4. anything else is base64-encoded unchanged.

        Args:
            image_path: Path of the image file to prepare.
            max_size: Maximum ``(width, height)`` before resizing kicks in.
            quality: JPEG quality applied on every re-encoding path.

        Returns:
            Base64 string of JPEG data.
        """
        constraints = check_image_constraints(image_path)
        if constraints['file_size_mb'] > 10:  # If larger than 10MB
            print("Compressing large image...")
            return ImageProcessor.compress_image(image_path, quality)

        with Image.open(image_path) as img:
            width, height = img.size
            source_format = img.format

        if width > max_size[0] or height > max_size[1]:
            print("Resizing large image...")
            # Forward quality; it used to be dropped on this path.
            return ImageProcessor.resize_image(image_path, max_size, quality)

        if source_format != 'JPEG':
            # Callers embed the result in an image/jpeg data URL, so raw
            # PNG/GIF/WebP bytes would be mislabelled. Re-encode instead.
            return ImageProcessor.compress_image(image_path, quality)

        # Image is fine as-is
        return encode_image(image_path)
# Usage
# Optimise a local image first, then send the result as a JPEG data URL.
processor = ImageProcessor()
optimized_image = processor.prepare_image_for_api("large_image.jpg")

question_part = {"type": "text", "text": "Analyze this optimized image."}
image_part = {
    "type": "image_url",
    "image_url": {"url": f"data:image/jpeg;base64,{optimized_image}"},
}
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[{"role": "user", "content": [question_part, image_part]}],
)
Advanced Multimodal Techniques
Multi-Image Analysis
python
def analyze_multiple_images(image_paths: list, question: str) -> str:
    """Analyze multiple images in a single request.

    The question is sent as the first content part, followed by one
    ``image_url`` part per path, in the order given.

    Args:
        image_paths: Paths of the image files to encode and attach.
        question: Text prompt that applies to all attached images.

    Returns:
        The text content of the first choice in the response.
    """
    content = [{"type": "text", "text": question}]
    content.extend(
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{encode_image(image_path)}"
            },
        }
        for image_path in image_paths
    )

    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": content}],
    )
    return response.choices[0].message.content
# Usage
# All three files are attached to one request together with the question.
image_paths = ["image1.jpg", "image2.jpg", "image3.jpg"]
result = analyze_multiple_images(
    image_paths,
    "Compare these three images and identify the main differences."
)
print(result)
Image Detail Control
python
def analyze_with_detail_control(image_path: str, detail_level: str = "auto") -> str:
    """Analyze one image while passing a ``detail`` hint with the image part.

    Args:
        image_path: Path of the image file to encode and send.
        detail_level: Value placed in the image part's ``detail`` field;
            this example uses "low", "high" or "auto".

    Returns:
        The text content of the first choice in the response.
    """
    encoded = encode_image(image_path)
    text_part = {
        "type": "text",
        "text": "Provide a detailed analysis of this image.",
    }
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{encoded}",
            "detail": detail_level,  # "low", "high", or "auto"
        },
    }
    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": [text_part, image_part]}],
    )
    return completion.choices[0].message.content
# Usage examples
# Same helper, called once per detail setting; the second argument is
# forwarded unchanged as the image part's "detail" value.
result_auto = analyze_with_detail_control("complex_chart.jpg", "auto")
result_high = analyze_with_detail_control("detailed_diagram.jpg", "high")
result_low = analyze_with_detail_control("simple_icon.jpg", "low")
Conversational Image Analysis
python
class ImageConversation:
    """Keep a running chat history that may mix text and image turns.

    ``messages`` holds the turns in order and is sent in full on every
    ``get_response`` call.
    """

    def __init__(self):
        self.messages = []

    def add_image_message(self, image_path: str, text: str):
        """Append a user turn made of ``text`` plus the encoded image."""
        encoded = encode_image(image_path)
        text_part = {"type": "text", "text": text}
        image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
        }
        self.messages.append({"role": "user", "content": [text_part, image_part]})

    def add_text_message(self, text: str, role: str = "user"):
        """Append a plain-text turn for the given role (``"user"`` by default)."""
        turn = {"role": role, "content": text}
        self.messages.append(turn)

    def get_response(self) -> str:
        """Send the whole history, record the reply as an assistant turn and return it."""
        completion = client.chat.completions.create(
            model="deepseek-chat",
            messages=self.messages,
        )
        reply = completion.choices[0].message.content
        self.add_text_message(reply, "assistant")
        return reply

    def clear_conversation(self):
        """Start over with a new, empty history list."""
        self.messages = []
# Usage
# Each get_response() call sends the whole history collected so far and
# appends the reply to it, so later turns can refer to earlier images.
conversation = ImageConversation()
# Start with an image
conversation.add_image_message("chart.jpg", "What does this chart show?")
response1 = conversation.get_response()
print("AI:", response1)
# Follow up with text
conversation.add_text_message("What are the key trends you notice?")
response2 = conversation.get_response()
print("AI:", response2)
# Add another image for comparison
conversation.add_image_message("chart2.jpg", "How does this compare to the previous chart?")
response3 = conversation.get_response()
print("AI:", response3)
Specialized Use Cases
OCR and Text Extraction
python
def extract_text_from_image(image_path: str) -> str:
    """Ask the model to transcribe all text visible in an image.

    Args:
        image_path: Path of the image file to encode and send.

    Returns:
        The text content of the first choice in the response.
    """
    encoded = encode_image(image_path)
    instruction_part = {
        "type": "text",
        "text": "Extract all text from this image. Provide the text exactly as it appears, maintaining formatting where possible.",
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
    }
    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": [instruction_part, image_part]}],
    )
    return completion.choices[0].message.content
def extract_structured_text(image_path: str) -> str:
    """Ask the model to return the image's text organized as JSON.

    The prompt requests a JSON object with the keys ``title``, ``headings``,
    ``body_text``, ``tables``, ``lists`` and ``other``.

    Args:
        image_path: Path of the image file to encode and send.

    Returns:
        The model's reply as a string. The reply is not parsed here, which is
        why the return annotation is ``str`` rather than ``dict``; callers
        that need a dict should run it through ``json.loads`` and be ready
        for the reply not being valid JSON.
    """
    base64_image = encode_image(image_path)
    instruction_part = {
        "type": "text",
        "text": """Extract text from this image and organize it into a JSON structure with the following format:
{
"title": "main title if any",
"headings": ["list of headings"],
"body_text": "main content",
"tables": ["any tabular data"],
"lists": ["any bullet points or lists"],
"other": "any other text"
}""",
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
    }
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": [instruction_part, image_part]}],
    )
    return response.choices[0].message.content
# Usage
# Both helpers return the model's reply text; the second one asks for JSON
# but does not parse it.
text_content = extract_text_from_image("document.jpg")
structured_content = extract_structured_text("document.jpg")
Chart and Graph Analysis
python
def analyze_chart(image_path: str, analysis_type: str = "comprehensive") -> str:
    """Analyze a chart image with a prompt chosen by ``analysis_type``.

    Args:
        image_path: Path of the chart image to encode and send.
        analysis_type: One of "comprehensive", "trends", "data_extraction",
            "insights" or "comparison". Any other value uses the
            "comprehensive" prompt.

    Returns:
        The text content of the first choice in the response.
    """
    analysis_prompts = {
        "comprehensive": "Provide a comprehensive analysis of this chart including data trends, key insights, and conclusions.",
        "trends": "Focus on identifying and describing the main trends shown in this chart.",
        "data_extraction": "Extract the specific data points and values shown in this chart.",
        "insights": "What are the key business insights that can be derived from this chart?",
        "comparison": "Compare the different data series or categories shown in this chart.",
    }
    if analysis_type in analysis_prompts:
        instructions = analysis_prompts[analysis_type]
    else:
        instructions = analysis_prompts["comprehensive"]

    encoded = encode_image(image_path)
    text_part = {"type": "text", "text": instructions}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
    }
    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": [text_part, image_part]}],
    )
    return completion.choices[0].message.content
# Usage examples
# The same chart is analysed three times, each with a different prompt
# selected by the second argument.
comprehensive_analysis = analyze_chart("sales_chart.jpg", "comprehensive")
trend_analysis = analyze_chart("sales_chart.jpg", "trends")
data_extraction = analyze_chart("sales_chart.jpg", "data_extraction")
Visual Question Answering
python
class VisualQA:
    """Answer questions about images that were encoded once and cached.

    ``image_cache`` maps an image id to the image's base64 string.
    """

    def __init__(self):
        self.image_cache = {}

    def load_image(self, image_path: str, image_id: str = None):
        """Encode the image and store it under ``image_id``.

        When no id is given, the path itself is used as the cache key.
        """
        cache_key = image_path if image_id is None else image_id
        self.image_cache[cache_key] = encode_image(image_path)

    def ask_question(self, question: str, image_id: str) -> str:
        """Send one question together with a cached image.

        Raises:
            ValueError: If ``image_id`` has not been loaded into the cache.
        """
        if image_id not in self.image_cache:
            raise ValueError(f"Image {image_id} not found in cache")

        text_part = {"type": "text", "text": question}
        image_part = {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{self.image_cache[image_id]}"
            },
        }
        completion = client.chat.completions.create(
            model="deepseek-chat",
            messages=[{"role": "user", "content": [text_part, image_part]}],
        )
        return completion.choices[0].message.content

    def batch_questions(self, questions: list, image_id: str) -> dict:
        """Ask each question in turn and return ``{question: answer}``."""
        return {
            question: self.ask_question(question, image_id)
            for question in questions
        }
# Usage
vqa = VisualQA()

# Encode the image a single time and register it under the id "product".
vqa.load_image("product_photo.jpg", "product")

# Every question below is sent as its own request against the cached image.
questions = [
    "What color is the product?",
    "What is the brand name?",
    "What are the key features visible?",
    "Is there any text or labels?",
    "What is the approximate size?",
]
answers = vqa.batch_questions(questions, "product")

for question, answer in answers.items():
    print(f"Q: {question}")
    print(f"A: {answer}\n")
Document Analysis
python
def analyze_document(image_path: str, document_type: str = "general") -> dict:
    """Analyze a document image with a prompt tailored to its type.

    Args:
        image_path: Path of the document image to encode and send.
        document_type: One of "invoice", "receipt", "contract", "form" or
            "general". Any other value uses the "general" prompt.

    Returns:
        A dict with ``document_type`` (as passed in), ``analysis`` (the
        model's reply text) and ``image_path``.
    """
    document_prompts = {
        "invoice": """Analyze this invoice and extract:
- Invoice number
- Date
- Vendor information
- Line items with quantities and prices
- Total amount
- Payment terms""",
        "receipt": """Analyze this receipt and extract:
- Store name and location
- Date and time
- Items purchased with prices
- Subtotal, tax, and total
- Payment method""",
        "contract": """Analyze this contract and identify:
- Parties involved
- Key terms and conditions
- Important dates
- Financial obligations
- Signatures and dates""",
        "form": """Analyze this form and extract:
- Form title and purpose
- All filled-in fields and values
- Empty fields that need completion
- Instructions or notes""",
        "general": """Analyze this document and provide:
- Document type and purpose
- Key information and data
- Structure and organization
- Important details""",
    }
    if document_type in document_prompts:
        instructions = document_prompts[document_type]
    else:
        instructions = document_prompts["general"]

    encoded = encode_image(image_path)
    text_part = {"type": "text", "text": instructions}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
    }
    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": [text_part, image_part]}],
    )
    return {
        "document_type": document_type,
        "analysis": completion.choices[0].message.content,
        "image_path": image_path,
    }
# Usage
# Each call returns a dict holding the document type, the analysis text and
# the image path that was analysed.
invoice_analysis = analyze_document("invoice.jpg", "invoice")
receipt_analysis = analyze_document("receipt.jpg", "receipt")
Performance Optimization
Batch Processing
python
import asyncio
from concurrent.futures import ThreadPoolExecutor
import time
class MultimodalBatchProcessor:
    """Process multiple images efficiently using a thread pool.

    Attributes are limited to ``max_workers``, the pool size used for each
    batch call.
    """

    def __init__(self, max_workers: int = 5):
        self.max_workers = max_workers

    def process_single_image(self, image_path: str, prompt: str) -> dict:
        """Process a single image and never raise.

        Returns:
            On success: ``image_path``, ``success=True``, ``result`` (reply
            text) and ``processing_time`` in seconds. On any exception:
            ``image_path``, ``success=False``, ``error`` (the exception
            text) and ``processing_time``.
        """
        # perf_counter is monotonic, so the elapsed time cannot go negative
        # if the wall clock is adjusted while the request is running.
        start_time = time.perf_counter()
        try:
            base64_image = encode_image(image_path)
            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}"
                                },
                            },
                        ],
                    }
                ],
            )
            return {
                "image_path": image_path,
                "success": True,
                "result": response.choices[0].message.content,
                "processing_time": time.perf_counter() - start_time,
            }
        except Exception as e:
            # Deliberately broad: one failing image must not abort the batch.
            return {
                "image_path": image_path,
                "success": False,
                "error": str(e),
                "processing_time": time.perf_counter() - start_time,
            }

    def _run_parallel(self, image_prompt_pairs: list) -> list:
        """Run process_single_image for each pair and return results in input order."""
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [
                executor.submit(self.process_single_image, image_path, prompt)
                for image_path, prompt in image_prompt_pairs
            ]
            # Collect in submission order so results line up with the inputs.
            return [future.result() for future in futures]

    def process_batch(self, image_paths: list, prompt: str) -> list:
        """Process multiple images in parallel with one shared prompt.

        Returns:
            One result dict per path, in the same order as ``image_paths``.
        """
        return self._run_parallel(
            [(image_path, prompt) for image_path in image_paths]
        )

    def process_with_different_prompts(self, image_prompt_pairs: list) -> list:
        """Process images with different prompts.

        Args:
            image_prompt_pairs: Iterable of ``(image_path, prompt)`` pairs.

        Returns:
            One result dict per pair, in input order.
        """
        return self._run_parallel(image_prompt_pairs)
# Usage
processor = MultimodalBatchProcessor(max_workers=3)

# Process multiple images with same prompt
image_paths = ["img1.jpg", "img2.jpg", "img3.jpg"]
results = processor.process_batch(image_paths, "Describe this image in detail.")

# Process images with different prompts (this replaces the results above)
image_prompt_pairs = [
    ("chart1.jpg", "Analyze this chart and extract key trends."),
    ("document1.jpg", "Extract all text from this document."),
    ("product1.jpg", "Describe this product and its features."),
]
results = processor.process_with_different_prompts(image_prompt_pairs)

# Print results
for result in results:
    status_line = (
        f"✅ {result['image_path']}: Processed in {result['processing_time']:.2f}s"
        if result["success"]
        else f"❌ {result['image_path']}: Error - {result['error']}"
    )
    print(status_line)
Caching and Optimization
python
import hashlib
import json
from pathlib import Path
class MultimodalCache:
    """Cache multimodal API responses as JSON files on disk.

    Each entry is stored as ``<image md5>_<prompt md5>.json`` inside
    ``cache_dir``, so the same image bytes plus the same prompt reuse one
    entry regardless of the image's file name.
    """

    def __init__(self, cache_dir: str = "./cache"):
        self.cache_dir = Path(cache_dir)
        # parents=True lets a nested cache directory be created in one step.
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _get_cache_key(self, image_path: str, prompt: str) -> str:
        """Generate cache key for image and prompt combination."""
        # MD5 is used purely as a content fingerprint, not for security.
        image_hash = hashlib.md5(Path(image_path).read_bytes()).hexdigest()
        prompt_hash = hashlib.md5(prompt.encode()).hexdigest()
        return f"{image_hash}_{prompt_hash}"

    def _cache_file(self, image_path: str, prompt: str) -> Path:
        """Return the JSON file path that belongs to this image/prompt pair."""
        return self.cache_dir / f"{self._get_cache_key(image_path, prompt)}.json"

    def get_cached_response(self, image_path: str, prompt: str) -> "dict | None":
        """Get cached response if available.

        Returns:
            The stored entry (``image_path``, ``prompt``, ``response``,
            ``timestamp``) or None when there is no entry or the entry
            cannot be decoded.
        """
        cache_file = self._cache_file(image_path, prompt)
        try:
            with open(cache_file, "r", encoding="utf-8") as f:
                return json.load(f)
        except FileNotFoundError:
            return None
        except (json.JSONDecodeError, UnicodeDecodeError):
            # A truncated or corrupt entry is treated as a miss; the next
            # cache_response call for this key overwrites it.
            return None

    def cache_response(self, image_path: str, prompt: str, response: str):
        """Cache API response together with the request details and a timestamp."""
        cache_data = {
            "image_path": image_path,
            "prompt": prompt,
            "response": response,
            "timestamp": time.time(),
        }
        with open(self._cache_file(image_path, prompt), "w", encoding="utf-8") as f:
            json.dump(cache_data, f, indent=2)

    def analyze_with_cache(self, image_path: str, prompt: str) -> str:
        """Analyze image with caching.

        Returns the cached reply when one exists; otherwise calls the API,
        stores the reply and returns it.
        """
        # Check cache first
        cached = self.get_cached_response(image_path, prompt)
        if cached and "response" in cached:
            print(f"✅ Using cached response for {image_path}")
            return cached["response"]

        # Make API call
        print(f"🔄 Making API call for {image_path}")
        base64_image = encode_image(image_path)
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            },
                        },
                    ],
                }
            ],
        )
        result = response.choices[0].message.content

        # Cache the response
        self.cache_response(image_path, prompt, result)
        return result
# Usage
# Entries are written under the default "./cache" directory.
cache = MultimodalCache()
# First call - makes API request
result1 = cache.analyze_with_cache("image.jpg", "Describe this image.")
# Second call - uses cache
result2 = cache.analyze_with_cache("image.jpg", "Describe this image.")
Best Practices
Prompt Engineering for Vision
python
# Ready-made prompts for common vision tasks, keyed by task name.
VISION_PROMPT_TEMPLATES = {
    "detailed_description": """Provide a detailed description of this image including:
- Main subjects and objects
- Colors, lighting, and composition
- Setting and background
- Any text or symbols visible
- Overall mood or atmosphere""",
    "technical_analysis": """Analyze this image from a technical perspective:
- Image quality and resolution
- Composition and framing
- Lighting conditions
- Any technical issues or artifacts
- Suggestions for improvement""",
    "accessibility_description": """Create an accessibility description for this image:
- Describe all visual elements clearly
- Include spatial relationships
- Mention colors and their significance
- Describe any text or important details
- Keep it concise but comprehensive""",
    "data_extraction": """Extract all data and information from this image:
- Any numerical data or statistics
- Text content and labels
- Categorical information
- Relationships between elements
- Structure the output clearly""",
    "comparison_analysis": """Compare and analyze the elements in this image:
- Identify different categories or groups
- Compare sizes, colors, or quantities
- Note similarities and differences
- Highlight the most significant findings""",
}


def get_optimized_prompt(task_type: str, custom_instructions: str = "") -> str:
    """Return the prompt template for ``task_type``, optionally extended.

    Args:
        task_type: Key into VISION_PROMPT_TEMPLATES. Unknown keys use the
            "detailed_description" template.
        custom_instructions: Extra text appended after a blank line and the
            label "Additional instructions: "; ignored when empty.

    Returns:
        The selected template, with the custom instructions appended if any.
    """
    fallback = VISION_PROMPT_TEMPLATES["detailed_description"]
    template = VISION_PROMPT_TEMPLATES.get(task_type, fallback)
    if not custom_instructions:
        return template
    return f"{template}\n\nAdditional instructions: {custom_instructions}"
# Usage
# Yields the "data_extraction" template followed by a blank line and
# "Additional instructions: Focus on financial data and percentages".
prompt = get_optimized_prompt("data_extraction", "Focus on financial data and percentages")
Error Handling
python
def robust_image_analysis(image_path: str, prompt: str, max_retries: int = 3) -> dict:
    """Robust image analysis with error handling and retries.

    The image is validated and encoded once; only the API call is retried,
    with exponential backoff (1s, 2s, 4s, ...) between attempts.

    Args:
        image_path: Path of the image file to analyze.
        prompt: Text prompt sent together with the image.
        max_retries: Maximum number of API attempts.

    Returns:
        A dict that always contains ``success``:
        - success: ``{"success": True, "result": <text>, "attempt": <n>}``
        - validation or read failure: ``{"success": False, "error": <text>}``
        - API failure after the last attempt:
          ``{"success": False, "error": <text>, "attempts": max_retries}``
    """
    # Validation and encoding cannot succeed on a later attempt, so they run
    # once up front instead of being repeated (with sleeps) on every retry.
    try:
        if not Path(image_path).exists():
            return {"success": False, "error": f"Image file not found: {image_path}"}
        if not validate_image_format(image_path):
            return {"success": False, "error": f"Unsupported image format: {image_path}"}
        constraints = check_image_constraints(image_path)
        if not constraints["file_size_ok"]:
            return {
                "success": False,
                "error": f"Image too large: {constraints['file_size_mb']:.2f} MB",
            }
        base64_image = encode_image(image_path)
    except Exception as e:
        # e.g. an unreadable or corrupt file; report it rather than raise,
        # matching how API errors are reported below.
        return {"success": False, "error": str(e)}

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                },
            ],
        }
    ]

    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=messages,
                timeout=30,  # 30 second timeout
            )
            return {
                "success": True,
                "result": response.choices[0].message.content,
                "attempt": attempt + 1,
            }
        except Exception as e:
            if attempt == max_retries - 1:  # Last attempt
                return {
                    "success": False,
                    "error": str(e),
                    "attempts": max_retries,
                }
            # Wait before retry
            time.sleep(2 ** attempt)  # Exponential backoff

    # Only reached when max_retries is zero or negative.
    return {"success": False, "error": "Max retries exceeded"}
# Usage
result = robust_image_analysis("image.jpg", "Describe this image.")
# .get() is used because not every failure dict carries a "success" key.
if not result.get("success"):
    print(f"❌ Error: {result['error']}")
else:
    print(f"✅ Success: {result['result']}")
Integration Examples
Web Application Integration
python
from flask import Flask, request, jsonify
import tempfile
import os
app = Flask(__name__)


@app.route('/analyze-image', methods=['POST'])
def analyze_image_endpoint():
    """Web endpoint for image analysis.

    Expects a multipart form with an ``image`` file and an optional
    ``prompt`` field (default: 'Describe this image.').

    Returns:
        200 with ``{"success": True, "analysis": ...}`` on success,
        400 when no usable file was uploaded,
        500 with an ``error`` message when analysis fails or raises.
    """
    tmp_path = None
    try:
        # Get uploaded file
        if 'image' not in request.files:
            return jsonify({"error": "No image file provided"}), 400
        file = request.files['image']
        prompt = request.form.get('prompt', 'Describe this image.')
        # Covers both an empty string and a missing (None) filename.
        if not file.filename:
            return jsonify({"error": "No file selected"}), 400

        # Keep the uploaded extension so the format check inside
        # robust_image_analysis sees the real type instead of a forced
        # ".jpg". Only the extension of the client-supplied name is used.
        suffix = os.path.splitext(file.filename)[1].lower() or '.jpg'

        # Save temporary file. Writing through the open handle (not by name)
        # avoids reopening a file that is still held open.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
            tmp_path = tmp_file.name
            file.save(tmp_file)

        # Analyze image
        result = robust_image_analysis(tmp_path, prompt)

        if result.get("success"):
            return jsonify({
                "success": True,
                "analysis": result["result"]
            })
        return jsonify({
            "success": False,
            "error": result["error"]
        }), 500
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    finally:
        # Clean up on every path, including when analysis raised.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)


if __name__ == '__main__':
    # NOTE(review): debug=True enables the interactive debugger; keep it for
    # local development only and never expose it on a public interface.
    app.run(debug=True)
Streamlit Application
python
import streamlit as st
from PIL import Image
import io
# This snippet uses tempfile and os but did not import them itself.
import os
import tempfile

st.set_page_config(page_title="Multimodal Image Analyzer", layout="wide")
st.title("🖼️ Multimodal Image Analyzer")

# Sidebar configuration
with st.sidebar:
    st.header("Configuration")
    analysis_type = st.selectbox(
        "Analysis Type",
        ["General Description", "OCR Text Extraction", "Chart Analysis", "Document Analysis"]
    )
    custom_prompt = st.text_area(
        "Custom Prompt (optional)",
        placeholder="Enter your custom analysis prompt..."
    )

# Main interface
col1, col2 = st.columns([1, 1])

image = None
with col1:
    st.header("Upload Image")
    uploaded_file = st.file_uploader(
        "Choose an image...",
        type=['jpg', 'jpeg', 'png', 'gif', 'webp']
    )
    if uploaded_file is not None:
        # Display image
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Image", use_column_width=True)

with col2:
    st.header("Analysis Results")
    if uploaded_file is not None and st.button("Analyze Image", type="primary"):
        with st.spinner("Analyzing image..."):
            # Determine prompt based on analysis type
            prompt_map = {
                "General Description": "Provide a detailed description of this image.",
                "OCR Text Extraction": "Extract all text from this image.",
                "Chart Analysis": "Analyze this chart and provide insights about the data.",
                "Document Analysis": "Analyze this document and extract key information."
            }
            prompt = custom_prompt or prompt_map[analysis_type]

            # Create the temporary file only when an analysis is requested.
            # The script reruns on every interaction, so creating it at
            # upload time left a stray file behind on each rerun.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file:
                temp_path = tmp_file.name
            try:
                # JPEG cannot store alpha or palette modes; convert first so
                # PNG/GIF/WebP uploads do not fail on save.
                jpeg_image = image if image.mode == 'RGB' else image.convert('RGB')
                jpeg_image.save(temp_path, format='JPEG')

                # Analyze image
                result = robust_image_analysis(temp_path, prompt)
            finally:
                # Clean up
                os.unlink(temp_path)

        if result.get("success"):
            st.success("Analysis completed!")
            st.write(result["result"])
        else:
            st.error(f"Analysis failed: {result['error']}")
Troubleshooting
Common Issues
python
def diagnose_multimodal_issues(image_path: str, error_message: str) -> list:
    """Diagnose common multimodal API issues.

    Combines checks on the image file (existence, extension, size) with
    keyword matching on the error message.

    Args:
        image_path: Path of the image that was being sent.
        error_message: Error text returned by the failed call; ``None`` or
            an empty string skips the keyword checks.

    Returns:
        A list of human-readable findings. It is never empty: when nothing
        matches, it holds a single "no obvious issues" entry. A missing file
        short-circuits every other check.
    """
    issues = []

    # Check file existence
    if not Path(image_path).exists():
        issues.append("❌ Image file does not exist")
        return issues

    # Check file format
    if not validate_image_format(image_path):
        issues.append("❌ Unsupported image format")

    # Check file size. An unreadable image is itself a finding, so report it
    # instead of letting the diagnostic helper crash.
    try:
        constraints = check_image_constraints(image_path)
    except Exception as exc:
        issues.append(f"❌ Could not read image: {exc}")
    else:
        if not constraints["file_size_ok"]:
            issues.append(f"❌ Image too large: {constraints['file_size_mb']:.2f} MB")

    # Check error message patterns: every term of an entry must be present.
    error_lower = (error_message or "").lower()
    message_patterns = [
        (("timeout",), "❌ Request timeout - try reducing image size"),
        (("rate limit",), "❌ Rate limit exceeded - wait before retrying"),
        (("invalid", "base64"), "❌ Invalid base64 encoding - check image processing"),
        (("model",), "❌ Model error - ensure using 'deepseek-chat'"),
    ]
    issues.extend(
        finding
        for required_terms, finding in message_patterns
        if all(term in error_lower for term in required_terms)
    )

    if not issues:
        issues.append("✅ No obvious issues detected")
    return issues
# Usage
# Print one finding per line for the given image and error text.
issues = diagnose_multimodal_issues(
    "problematic_image.jpg",
    "Request timeout error",
)
for issue in issues:
    print(issue)