Skip to content

多模态使用指南

概述

DeepSeek 多模态 AI 能够同时处理文本和图像输入,为您提供强大的视觉理解和分析能力。本指南将帮助您充分利用多模态功能。

支持的模型

deepseek-vl

  • 用途: 图像理解和分析
  • 特点: 高精度视觉识别
  • 适用场景: 图像描述、物体检测、场景分析

deepseek-vl-chat

  • 用途: 对话式图像问答
  • 特点: 自然语言交互
  • 适用场景: 图像问答、视觉助手、教育应用

快速开始

基础图像理解

python
import openai

# Create the SDK client against the DeepSeek base URL.
client = openai.OpenAI(
    base_url="https://api.deepseek.com",
    api_key="your-api-key",
)

# A single user turn made of a text instruction plus a remote image URL.
text_part = {"type": "text", "text": "请描述这张图片的内容"}
image_part = {
    "type": "image_url",
    "image_url": {"url": "https://example.com/image.jpg"},
}
user_message = {"role": "user", "content": [text_part, image_part]}

response = client.chat.completions.create(
    model="deepseek-vl-chat",
    messages=[user_message],
)

# Print the text of the first returned choice.
first_choice = response.choices[0]
print(first_choice.message.content)

本地图像处理

python
import base64
import openai

def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a Base64 string.

    The file is read in binary mode, Base64-encoded, and the encoded bytes
    are decoded as UTF-8 so that the result is a ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# Encode the local image as Base64 text.
base64_image = encode_image("path/to/your/image.jpg")

client = openai.OpenAI(
    base_url="https://api.deepseek.com",
    api_key="your-api-key",
)

# The image is sent inline as a data URL instead of a remote link.
image_data_url = f"data:image/jpeg;base64,{base64_image}"
user_content = [
    {"type": "text", "text": "分析这张图片中的文字内容"},
    {"type": "image_url", "image_url": {"url": image_data_url}},
]

response = client.chat.completions.create(
    model="deepseek-vl-chat",
    messages=[{"role": "user", "content": user_content}],
)

print(response.choices[0].message.content)

高级用法

多图像对比分析

python
# Both images go into the same user turn, after the text instruction.
compared_images = (
    "https://example.com/image1.jpg",
    "https://example.com/image2.jpg",
)
comparison_content = [{"type": "text", "text": "比较这两张图片的差异"}]
comparison_content.extend(
    {"type": "image_url", "image_url": {"url": link}} for link in compared_images
)

response = client.chat.completions.create(
    model="deepseek-vl-chat",
    messages=[{"role": "user", "content": comparison_content}],
)

对话式图像分析

python
# First turn: a question together with the chart image.
first_question = {
    "role": "user",
    "content": [
        {"type": "text", "text": "这张图片显示的是什么?"},
        {"type": "image_url", "image_url": {"url": "https://example.com/chart.jpg"}},
    ],
}
messages = [first_question]

response = client.chat.completions.create(
    messages=messages,
    model="deepseek-vl-chat",
)

# Append the model's reply and then the follow-up question to the history,
# so the second request carries the whole conversation.
assistant_reply = response.choices[0].message.content
messages += [
    {"role": "assistant", "content": assistant_reply},
    {"role": "user", "content": "能详细解释一下图表中的数据趋势吗?"},
]

response = client.chat.completions.create(
    messages=messages,
    model="deepseek-vl-chat",
)

结构化数据提取

python
# The prompt asks for JSON and the request also sets a JSON-object
# response format.
extraction_request = {
    "role": "user",
    "content": [
        {"type": "text", "text": "提取图片中的表格数据,以JSON格式返回"},
        {"type": "image_url", "image_url": {"url": "https://example.com/table.jpg"}},
    ],
}

response = client.chat.completions.create(
    model="deepseek-vl-chat",
    response_format={"type": "json_object"},
    messages=[extraction_request],
)

应用场景

1. 文档处理

python
def process_document_image(image_path, task="extract_text"):
    """Send a local document image to the model with a task-specific prompt.

    Args:
        image_path: Path of the image file. It is Base64-encoded with
            ``encode_image`` and embedded in the request as a data URL.
        task: One of ``"extract_text"``, ``"summarize"``, ``"translate"`` or
            ``"format"``. Any other value falls back to the
            ``"extract_text"`` prompt.

    Returns:
        The message content of the first choice in the model response.
    """
    # Local import keeps this example self-contained.
    import mimetypes

    base64_image = encode_image(image_path)

    # Label the data URL with the file's actual image type (e.g. image/png)
    # instead of always claiming JPEG. Unknown or non-image extensions keep
    # the previous image/jpeg default.
    guessed_type, _ = mimetypes.guess_type(str(image_path))
    if guessed_type and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/jpeg"

    prompts = {
        "extract_text": "提取图片中的所有文字内容",
        "summarize": "总结文档的主要内容",
        "translate": "翻译图片中的文字为中文",
        "format": "将图片中的表格数据整理为结构化格式"
    }

    response = client.chat.completions.create(
        model="deepseek-vl-chat",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        # Unrecognized tasks use the text-extraction prompt.
                        "text": prompts.get(task, prompts["extract_text"])
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        }
                    }
                ]
            }
        ]
    )

    return response.choices[0].message.content

2. 教育辅助

python
def educational_assistant(image_path, question):
    """Answer a student's question about a local image.

    Args:
        image_path: Path of the image file. It is Base64-encoded with
            ``encode_image`` and embedded in the request as a data URL.
        question: The question text sent alongside the image.

    Returns:
        The message content of the first choice in the model response.
    """
    # Local import keeps this example self-contained.
    import mimetypes

    base64_image = encode_image(image_path)

    # Label the data URL with the file's actual image type (e.g. image/png)
    # instead of always claiming JPEG. Unknown or non-image extensions keep
    # the previous image/jpeg default.
    guessed_type, _ = mimetypes.guess_type(str(image_path))
    if guessed_type and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/jpeg"

    response = client.chat.completions.create(
        model="deepseek-vl-chat",
        messages=[
            {
                # The system message sets the assistant's teaching role.
                "role": "system",
                "content": "你是一个教育助手,请根据图片内容回答学生的问题,提供详细的解释和步骤。"
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": question
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        }
                    }
                ]
            }
        ]
    )

    return response.choices[0].message.content

# Usage example
answer = educational_assistant(
    image_path="math_problem.jpg",
    question="请解释这道数学题的解题步骤",
)

3. 商品分析

python
def product_analysis(image_path):
    """Ask the model for a JSON-formatted analysis of a product image.

    Args:
        image_path: Path of the image file. It is Base64-encoded with
            ``encode_image`` and embedded in the request as a data URL.

    Returns:
        The message content of the first choice in the model response. The
        request sets ``response_format`` to a JSON object.
    """
    # Local import keeps this example self-contained.
    import mimetypes

    base64_image = encode_image(image_path)

    # Label the data URL with the file's actual image type (e.g. image/png)
    # instead of always claiming JPEG. Unknown or non-image extensions keep
    # the previous image/jpeg default.
    guessed_type, _ = mimetypes.guess_type(str(image_path))
    if guessed_type and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/jpeg"

    response = client.chat.completions.create(
        model="deepseek-vl-chat",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        # The leading spaces on the continuation lines are
                        # part of the prompt text and are left unchanged.
                        "text": """分析这个商品图片,提供以下信息:
                        1. 商品类型和名称
                        2. 主要特征和卖点
                        3. 目标用户群体
                        4. 建议的营销策略
                        请以JSON格式返回结果"""
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        response_format={"type": "json_object"}
    )

    return response.choices[0].message.content

最佳实践

1. 图像预处理

python
from PIL import Image
import io

def optimize_image(image_path, max_size=(1024, 1024), quality=85):
    """Shrink an image and return it as Base64-encoded JPEG text.

    Args:
        image_path: Location of the image, passed to ``Image.open``.
        max_size: ``(width, height)`` bound passed to ``thumbnail``.
        quality: JPEG ``quality`` setting used when saving.

    Returns:
        The saved JPEG bytes, Base64-encoded and decoded as a UTF-8 ``str``.
    """
    with Image.open(image_path) as source:
        # Convert non-RGB images to RGB mode.
        frame = source if source.mode == 'RGB' else source.convert('RGB')

        # Bound the image by max_size using Lanczos resampling.
        frame.thumbnail(max_size, Image.Resampling.LANCZOS)

        # Save into an in-memory buffer rather than a file on disk.
        jpeg_buffer = io.BytesIO()
        frame.save(jpeg_buffer, format='JPEG', quality=quality)
        jpeg_bytes = jpeg_buffer.getvalue()

    return base64.b64encode(jpeg_bytes).decode('utf-8')

2. 批量处理

python
import asyncio
import aiohttp

async def process_images_batch(image_urls, prompt):
    """Run the same prompt against several image URLs concurrently.

    Args:
        image_urls: Iterable of image URLs; one request is made per URL.
        prompt: Text instruction sent with every image.

    Returns:
        A list with one dict per URL, in input order: ``{"url", "result"}``
        on success, or ``{"url", "error"}`` when the request raised.
    """
    # Local import keeps this example self-contained.
    import inspect

    async def process_single_image(url):
        request = {
            "model": "deepseek-vl-chat",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": url}}
                    ]
                }
            ],
        }
        try:
            # The client created in this guide is the synchronous
            # ``openai.OpenAI``; its ``create`` result cannot be awaited and
            # the call blocks. Running it in a worker thread keeps the event
            # loop free so the requests overlap.
            response = await asyncio.to_thread(
                client.chat.completions.create, **request
            )
            # With an async client the call above returns an awaitable
            # instead of a response; resolve it here so both kinds work.
            if inspect.isawaitable(response):
                response = await response
            return {"url": url, "result": response.choices[0].message.content}
        except Exception as e:
            # Deliberate best effort: one failed image is reported in its
            # own entry and does not abort the rest of the batch.
            return {"url": url, "error": str(e)}

    # No HTTP session is opened here: the SDK client performs the requests.
    results = await asyncio.gather(
        *(process_single_image(url) for url in image_urls)
    )

    return results

3. 错误处理和重试

python
import time
from functools import wraps

def retry_on_error(max_retries=3, delay=1, exceptions=(Exception,)):
    """Build a decorator that retries a function with exponential backoff.

    Args:
        max_retries: Total number of attempts. Values below 1 are treated
            as 1, so the wrapped function is always called at least once.
        delay: Base wait in seconds. After failed attempt ``k`` (0-based)
            the wrapper sleeps ``delay * 2 ** k`` seconds before retrying.
        exceptions: Exception class, or tuple of classes, that trigger a
            retry. Any other exception propagates immediately.

    Returns:
        A decorator. The decorated function returns the wrapped function's
        result, or re-raises the exception from the final failed attempt.
    """
    # Guard against max_retries <= 0, which would otherwise skip the call
    # entirely and silently return None.
    attempts = max(1, max_retries)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(attempts):
                try:
                    return func(*args, **kwargs)
                except exceptions as e:
                    if attempt == attempts - 1:
                        # Bare raise re-raises the active exception as-is.
                        raise
                    print(f"尝试 {attempt + 1} 失败: {e}")
                    time.sleep(delay * (2 ** attempt))  # exponential backoff
        return wrapper
    return decorator

@retry_on_error(max_retries=3)
def analyze_image_with_retry(image_url, prompt):
    """Request an analysis of ``image_url`` guided by ``prompt``.

    The function is wrapped by ``retry_on_error(max_retries=3)``.

    Returns:
        The response object returned by ``client.chat.completions.create``.
    """
    user_content = [
        {"type": "text", "text": prompt},
        {"type": "image_url", "image_url": {"url": image_url}},
    ]
    request_messages = [{"role": "user", "content": user_content}]
    return client.chat.completions.create(
        model="deepseek-vl-chat",
        messages=request_messages,
    )

性能优化

1. 图像大小优化

  • 推荐分辨率:1024x1024 以下
  • 文件大小:建议小于 5MB
  • 格式:JPEG 或 PNG

2. 提示词优化

  • 明确具体的任务要求
  • 使用结构化的输出格式
  • 提供必要的上下文信息

3. 并发控制

python
import asyncio
from asyncio import Semaphore

async def process_with_concurrency_limit(image_urls, max_concurrent=5, process_func=None):
    """Process image URLs concurrently with a cap on in-flight calls.

    Args:
        image_urls: Iterable of image URLs to process.
        max_concurrent: Maximum number of URLs processed at the same time.
            Must be at least 1.
        process_func: Coroutine function called as ``process_func(url)`` for
            each URL. When omitted, a module-level ``process_single_image``
            coroutine function is used and must exist.

    Returns:
        A list of the per-URL results, in the order of ``image_urls``.

    Raises:
        ValueError: If ``max_concurrent`` is less than 1.
    """
    if max_concurrent < 1:
        # A semaphore with no permits would make every task wait forever.
        raise ValueError("max_concurrent must be at least 1")

    # Looked up at call time so that passing process_func avoids any
    # dependency on a global helper.
    worker = process_func if process_func is not None else process_single_image

    semaphore = Semaphore(max_concurrent)

    async def process_with_limit(url):
        # Hold a permit for the whole duration of the per-URL call.
        async with semaphore:
            return await worker(url)

    return await asyncio.gather(*(process_with_limit(url) for url in image_urls))

常见问题

Q: 支持哪些图像格式?

A: 支持 JPEG、PNG、GIF、WebP 等常见格式。

Q: 图像大小有限制吗?

A: 建议单张图片不超过 20MB,分辨率不超过 4096x4096;如需更快的处理速度,可参考"性能优化"一节,将图片控制在 1024x1024、5MB 以内。

Q: 可以处理多少张图片?

A: 单次请求最多支持 10 张图片。

Q: 如何提高识别准确率?

A:

  • 确保图像清晰度
  • 提供具体的提示词
  • 避免过于复杂的场景

Q: 支持视频处理吗?

A: 目前仅支持静态图像,不支持视频文件。

相关资源

技术支持

如需技术支持,请联系我们的技术支持团队。

基于 DeepSeek AI 大模型技术