多模态使用指南
概述
DeepSeek 多模态 AI 能够同时处理文本和图像输入，为您提供强大的视觉理解和分析能力。本指南将帮助您充分利用多模态功能。
支持的模型
deepseek-vl
- 用途: 图像理解和分析
- 特点: 高精度视觉识别
- 适用场景: 图像描述、物体检测、场景分析
deepseek-vl-chat
- 用途: 对话式图像问答
- 特点: 自然语言交互
- 适用场景: 图像问答、视觉助手、教育应用
快速开始
基础图像理解
python
import openai
# Build an OpenAI-SDK client that points at the DeepSeek base URL.
client = openai.OpenAI(api_key="your-api-key", base_url="https://api.deepseek.com")

# One user turn made of two parts: a text instruction and a remote image URL.
response = client.chat.completions.create(
    model="deepseek-vl-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "请描述这张图片的内容"},
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/image.jpg"},
                },
            ],
        },
    ],
)

# Show the text of the first returned choice.
print(response.choices[0].message.content)
本地图像处理
python
import base64
import openai
def encode_image(image_path):
    """Read a file in binary mode and return its contents as base64 text.

    Args:
        image_path: Path of the file to read.

    Returns:
        The base64 encoding of the file's bytes, decoded to a ``str``.
    """
    with open(image_path, "rb") as source:
        raw_bytes = source.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
# Encode the local file so it can be embedded in the request as a data URL.
base64_image = encode_image("path/to/your/image.jpg")

client = openai.OpenAI(api_key="your-api-key", base_url="https://api.deepseek.com")

# The image travels inline as a base64 data URL instead of a remote link.
response = client.chat.completions.create(
    model="deepseek-vl-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "分析这张图片中的文字内容"},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                },
            ],
        },
    ],
)

# Show the text of the first returned choice.
print(response.choices[0].message.content)
高级用法
多图像对比分析
python
# A single user turn: one text instruction followed by one image part per URL.
response = client.chat.completions.create(
    model="deepseek-vl-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "比较这两张图片的差异"},
                *(
                    {"type": "image_url", "image_url": {"url": image_link}}
                    for image_link in (
                        "https://example.com/image1.jpg",
                        "https://example.com/image2.jpg",
                    )
                ),
            ],
        },
    ],
)
对话式图像分析
python
# Round one: ask an opening question about the image.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "这张图片显示的是什么?"},
            {"type": "image_url", "image_url": {"url": "https://example.com/chart.jpg"}},
        ],
    },
]
response = client.chat.completions.create(model="deepseek-vl-chat", messages=messages)

# Grow the history: first the model's reply, then the follow-up question.
messages.extend(
    [
        {"role": "assistant", "content": response.choices[0].message.content},
        {"role": "user", "content": "能详细解释一下图表中的数据趋势吗?"},
    ]
)

# Round two: resend the whole history so the follow-up has its context.
response = client.chat.completions.create(model="deepseek-vl-chat", messages=messages)
结构化数据提取
python
# Ask for the table in the image and request a JSON-object response format.
response = client.chat.completions.create(
    model="deepseek-vl-chat",
    response_format={"type": "json_object"},
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "提取图片中的表格数据,以JSON格式返回"},
                {"type": "image_url", "image_url": {"url": "https://example.com/table.jpg"}},
            ],
        },
    ],
)
应用场景
1. 文档处理
python
def process_document_image(image_path, task="extract_text"):
    """Run a document-oriented task on a local image and return the reply text.

    Args:
        image_path: Path of the image file to send.
        task: One of "extract_text", "summarize", "translate" or "format".
            Any other value falls back to the "extract_text" prompt.

    Returns:
        The message content of the first choice in the response.
    """
    import mimetypes  # local import so the snippet stays self-contained

    prompts = {
        "extract_text": "提取图片中的所有文字内容",
        "summarize": "总结文档的主要内容",
        "translate": "翻译图片中的文字为中文",
        "format": "将图片中的表格数据整理为结构化格式",
    }

    # Label the data URL with the type implied by the file name instead of
    # always claiming JPEG; unknown or non-image types keep the old JPEG label.
    guessed_type = mimetypes.guess_type(str(image_path))[0] or ""
    mime_type = guessed_type if guessed_type.startswith("image/") else "image/jpeg"

    base64_image = encode_image(image_path)
    response = client.chat.completions.create(
        model="deepseek-vl-chat",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompts.get(task, prompts["extract_text"]),
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        },
                    },
                ],
            }
        ],
    )
    return response.choices[0].message.content
2. 教育辅助
python
def educational_assistant(image_path, question):
    """Answer a student's question about a local image and return the reply text.

    A system message sets the assistant's teaching role; the user message
    carries the question text together with the image as a base64 data URL.

    Args:
        image_path: Path of the image file to send.
        question: The question to ask about the image.

    Returns:
        The message content of the first choice in the response.
    """
    import mimetypes  # local import so the snippet stays self-contained

    # Label the data URL with the type implied by the file name instead of
    # always claiming JPEG; unknown or non-image types keep the old JPEG label.
    guessed_type = mimetypes.guess_type(str(image_path))[0] or ""
    mime_type = guessed_type if guessed_type.startswith("image/") else "image/jpeg"

    base64_image = encode_image(image_path)
    response = client.chat.completions.create(
        model="deepseek-vl-chat",
        messages=[
            {
                "role": "system",
                "content": "你是一个教育助手,请根据图片内容回答学生的问题,提供详细的解释和步骤。",
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        },
                    },
                ],
            },
        ],
    )
    return response.choices[0].message.content
# Usage example: ask for a step-by-step explanation of a photographed problem.
answer = educational_assistant(
    image_path="math_problem.jpg",
    question="请解释这道数学题的解题步骤",
)
3. 商品分析
python
def product_analysis(image_path):
    """Request a JSON-formatted marketing analysis of a product photo.

    Args:
        image_path: Path of the product image file to send.

    Returns:
        The message content of the first choice in the response. The request
        sets ``response_format`` to ``json_object``; the content is returned
        as-is, without being parsed here.
    """
    import mimetypes  # local import so the snippet stays self-contained

    # Label the data URL with the type implied by the file name instead of
    # always claiming JPEG; unknown or non-image types keep the old JPEG label.
    guessed_type = mimetypes.guess_type(str(image_path))[0] or ""
    mime_type = guessed_type if guessed_type.startswith("image/") else "image/jpeg"

    # Same prompt text as before, written as adjacent literals so the code
    # can be indented without adding whitespace to the prompt.
    analysis_prompt = (
        "分析这个商品图片,提供以下信息:\n"
        "1. 商品类型和名称\n"
        "2. 主要特征和卖点\n"
        "3. 目标用户群体\n"
        "4. 建议的营销策略\n"
        "请以JSON格式返回结果"
    )

    base64_image = encode_image(image_path)
    response = client.chat.completions.create(
        model="deepseek-vl-chat",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": analysis_prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        },
                    },
                ],
            }
        ],
        response_format={"type": "json_object"},
    )
    return response.choices[0].message.content
最佳实践
1. 图像预处理
python
from PIL import Image
import io
def optimize_image(image_path, max_size=(1024, 1024), quality=85):
    """Shrink and re-encode an image, returning it as base64 JPEG text.

    Args:
        image_path: Path of the image file to open.
        max_size: ``(width, height)`` bound passed to ``Image.thumbnail``.
        quality: JPEG quality passed to ``Image.save``.

    Returns:
        The base64 encoding of the re-encoded JPEG bytes, as a ``str``.
    """
    import base64  # imported here so this snippet runs on its own

    # Pillow 9.1 moved the resampling filters into Image.Resampling; older
    # releases expose LANCZOS directly on the Image module.
    lanczos = getattr(Image, "Resampling", Image).LANCZOS

    with Image.open(image_path) as img:
        # Convert any other mode to RGB before saving as JPEG.
        if img.mode != 'RGB':
            img = img.convert('RGB')
        # thumbnail() resizes in place.
        img.thumbnail(max_size, lanczos)
        # Encode into memory rather than to a file on disk.
        buffer = io.BytesIO()
        img.save(buffer, format='JPEG', quality=quality)
    return base64.b64encode(buffer.getvalue()).decode('utf-8')
2. 批量处理
python
import asyncio
import aiohttp
async def process_images_batch(image_urls, prompt):
    """Send the same prompt for several image URLs concurrently.

    Args:
        image_urls: Iterable of image URLs to analyse.
        prompt: Text instruction sent together with every image.

    Returns:
        A list with one dict per URL, in input order: either
        ``{"url": ..., "result": <reply text>}`` on success or
        ``{"url": ..., "error": <exception text>}`` on failure.
    """
    loop = asyncio.get_running_loop()

    def send_request(url):
        return client.chat.completions.create(
            model="deepseek-vl-chat",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": url}},
                    ],
                }
            ],
        )

    async def process_single_image(url):
        try:
            # NOTE(review): assumes `client` is the synchronous openai.OpenAI
            # instance created earlier in this guide. Its create() call blocks
            # and cannot be awaited, so it runs in the default executor.
            response = await loop.run_in_executor(None, send_request, url)
            # If `client` is an async client instead, create() hands back a
            # coroutine, which is awaited here.
            if asyncio.iscoroutine(response):
                response = await response
            return {"url": url, "result": response.choices[0].message.content}
        except Exception as e:
            # Deliberately broad: one failing image is reported in its own
            # result entry and must not abort the rest of the batch.
            return {"url": url, "error": str(e)}

    return await asyncio.gather(*(process_single_image(url) for url in image_urls))
3. 错误处理和重试
python
import time
from functools import wraps
def retry_on_error(max_retries=3, delay=1):
    """Build a decorator that retries a failing call with exponential backoff.

    Args:
        max_retries: Total number of attempts, including the first one.
            Must be at least 1.
        delay: Base wait in seconds. After failed attempt ``k`` (0-based) the
            wrapper sleeps ``delay * 2 ** k`` before trying again.

    Returns:
        A decorator. The decorated function returns whatever the wrapped
        function returns, and re-raises the last exception once every
        attempt has failed.

    Raises:
        ValueError: If ``max_retries`` is less than 1. Previously such a value
            made the wrapper return None without ever calling the function.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    if attempt == max_retries - 1:
                        # Out of attempts: re-raise the active exception.
                        raise
                    print(f"尝试 {attempt + 1} 失败: {e}")
                    time.sleep(delay * (2 ** attempt))  # exponential backoff

        return wrapper

    return decorator
@retry_on_error(max_retries=3)
def analyze_image_with_retry(image_url, prompt):
    """Send one prompt-plus-image request, wrapped by ``retry_on_error``.

    Args:
        image_url: URL of the image to analyse.
        prompt: Text instruction sent with the image.

    Returns:
        The response object returned by ``client.chat.completions.create``
        (the whole response, not only its text).
    """
    user_content = [
        {"type": "text", "text": prompt},
        {"type": "image_url", "image_url": {"url": image_url}},
    ]
    return client.chat.completions.create(
        model="deepseek-vl-chat",
        messages=[{"role": "user", "content": user_content}],
    )
性能优化
1. 图像大小优化
- 推荐分辨率:1024x1024 以下
- 文件大小:建议小于 5MB
- 格式:JPEG 或 PNG
2. 提示词优化
- 明确具体的任务要求
- 使用结构化的输出格式
- 提供必要的上下文信息
3. 并发控制
python
import asyncio
from asyncio import Semaphore
async def process_with_concurrency_limit(image_urls, max_concurrent=5, worker=None):
    """Process URLs concurrently with at most ``max_concurrent`` in flight.

    Args:
        image_urls: Iterable of image URLs to process.
        max_concurrent: Upper bound on simultaneously running workers.
            Must be at least 1.
        worker: Coroutine function called as ``worker(url)`` for each URL.
            When omitted, the module-level ``process_single_image`` is used,
            which the caller must define.

    Returns:
        A list of the workers' results, in the same order as ``image_urls``.

    Raises:
        ValueError: If ``max_concurrent`` is less than 1. A semaphore of 0
            would block every worker forever.
    """
    if max_concurrent < 1:
        raise ValueError("max_concurrent must be at least 1")
    if worker is None:
        # Backward-compatible default: the name the original code called.
        worker = process_single_image

    semaphore = Semaphore(max_concurrent)

    async def process_with_limit(url):
        # Each task holds the semaphore only while its worker is running.
        async with semaphore:
            return await worker(url)

    return await asyncio.gather(*(process_with_limit(url) for url in image_urls))
常见问题
Q: 支持哪些图像格式?
A: 支持 JPEG、PNG、GIF、WebP 等常见格式。
Q: 图像大小有限制吗?
A: 建议单张图片不超过 20MB，分辨率不超过 4096x4096；如需更快的响应速度，可参考"性能优化"一节，将图片控制在 1024x1024、5MB 以内。
Q: 可以处理多少张图片?
A: 单次请求最多支持 10 张图片。
Q: 如何提高识别准确率?
A:
- 确保图像清晰度
- 提供具体的提示词
- 避免过于复杂的场景
Q: 支持视频处理吗?
A: 目前仅支持静态图像,不支持视频文件。
相关资源
技术支持
如需技术支持,请联系我们的技术支持团队。