Skip to content

多模态 API

DeepSeek 多模态 API 支持文本和图像的混合输入,让您能够构建更丰富的 AI 应用。

概述

多模态 API 允许您:

  • 同时处理文本和图像输入
  • 进行图像理解和分析
  • 实现视觉问答功能
  • 生成基于图像的文本描述

支持的模型

| 模型名称 | 描述 | 支持格式 |
| --- | --- | --- |
| deepseek-vl | 视觉语言模型 | 文本 + 图像 |
| deepseek-vl-chat | 对话式视觉模型 | 文本 + 图像 |

基本用法

图像理解

python
import openai
import base64

# Client used by the requests in this document, pointed at the DeepSeek base URL.
# "YOUR_API_KEY" is a placeholder: substitute a real key before running.
client = openai.OpenAI(
    api_key="YOUR_API_KEY",
    base_url="https://api.deepseek.com"
)

# 读取并编码图像
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as Base64 text.

    The file is read in binary mode; the Base64 bytes are decoded as UTF-8
    so the result can be embedded directly in a string.
    """
    with open(image_path, "rb") as source:
        raw_bytes = source.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

# Encode the local image, then send it together with a text question in a
# single user turn.
base64_image = encode_image("path/to/your/image.jpg")

question_part = {"type": "text", "text": "这张图片里有什么?"}
image_part = {
    "type": "image_url",
    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
}

response = client.chat.completions.create(
    model="deepseek-vl-chat",
    messages=[{"role": "user", "content": [question_part, image_part]}],
    max_tokens=1000,
)

# Print the text of the first returned choice.
print(response.choices[0].message.content)

使用图像 URL

python
# The image is referenced by URL here rather than embedded as Base64 data.
instruction_part = {"type": "text", "text": "描述这张图片的内容"}
remote_image_part = {
    "type": "image_url",
    "image_url": {"url": "https://example.com/image.jpg"},
}

response = client.chat.completions.create(
    model="deepseek-vl-chat",
    messages=[{"role": "user", "content": [instruction_part, remote_image_part]}],
)

Node.js 示例

javascript
const OpenAI = require('openai');
const fs = require('fs');

// Client pointed at the DeepSeek base URL.
// 'YOUR_API_KEY' is a placeholder: substitute a real key before running.
const client = new OpenAI({
    apiKey: 'YOUR_API_KEY',
    baseURL: 'https://api.deepseek.com'
});

// 编码图像
/**
 * Read the file at `imagePath` synchronously and return its bytes as a
 * Base64 string.
 */
function encodeImage(imagePath) {
    return fs.readFileSync(imagePath).toString('base64');
}

/**
 * Send a local image plus a text instruction to the model and log the reply.
 *
 * The returned promise rejects if reading the file or the API request
 * throws; the call site below handles that rejection.
 */
async function analyzeImage() {
    const base64Image = encodeImage('path/to/image.jpg');

    const response = await client.chat.completions.create({
        model: 'deepseek-vl-chat',
        messages: [
            {
                role: 'user',
                content: [
                    {
                        type: 'text',
                        text: '分析这张图片中的物体和场景'
                    },
                    {
                        type: 'image_url',
                        image_url: {
                            url: `data:image/jpeg;base64,${base64Image}`
                        }
                    }
                ]
            }
        ],
        max_tokens: 1000
    });

    console.log(response.choices[0].message.content);
}

// Handle the rejection explicitly so a failed file read or request is
// reported here instead of surfacing as an unhandled promise rejection.
analyzeImage().catch((error) => {
    console.error(error);
    process.exitCode = 1;
});

高级功能

多图像分析

python
# One user turn: a question followed by two Base64-encoded images.
# image1_base64 and image2_base64 must already hold the encoded image data.
image_parts = [
    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}}
    for encoded in (image1_base64, image2_base64)
]

response = client.chat.completions.create(
    model="deepseek-vl-chat",
    messages=[
        {
            "role": "user",
            "content": [{"type": "text", "text": "比较这两张图片的差异"}, *image_parts],
        }
    ],
)

图像问答对话

python
# Conversation history; the opening user turn carries the question and the image.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "这张图片里有什么动物?"},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
            },
        ],
    }
]

# Round one: ask about the image.
response = client.chat.completions.create(model="deepseek-vl-chat", messages=messages)

# Record the assistant's reply and then the follow-up question, so the next
# request is sent with the full history.
messages.extend([
    {"role": "assistant", "content": response.choices[0].message.content},
    {"role": "user", "content": "它们在做什么?"},
])

# Round two: continue the conversation.
response = client.chat.completions.create(model="deepseek-vl-chat", messages=messages)

支持的图像格式

  • JPEG (.jpg, .jpeg)
  • PNG (.png)
  • GIF (.gif)
  • WebP (.webp)
  • BMP (.bmp)

图像限制

| 限制项 | 值 |
| --- | --- |
| 最大文件大小 | 20MB |
| 最大分辨率 | 4096x4096 |
| 每次请求最大图像数 | 10张 |
| 支持的颜色模式 | RGB, RGBA |

参数说明

图像相关参数

python
{
    "type": "image_url",
    "image_url": {
        "url": "图像URL或base64编码",
        "detail": "low|high"  # image processing precision: pass either "low" or "high"
    }
}
  • detail:
    • low: 低精度处理,更快但精度较低
    • high: 高精度处理,更准确但速度较慢

模型参数

python
# Generation options, forwarded to the request as keyword arguments.
generation_options = {
    "max_tokens": 1000,   # maximum number of output tokens
    "temperature": 0.7,   # creativity control
    "top_p": 0.9,         # nucleus sampling parameter
    "stream": False,      # whether to stream the output
}

response = client.chat.completions.create(
    model="deepseek-vl-chat",
    messages=messages,
    **generation_options,
)

最佳实践

1. 图像预处理

python
from PIL import Image
import io

def optimize_image(image_path, max_size=(1024, 1024), quality=85):
    """Downscale an image, re-encode it as JPEG and return it Base64-encoded.

    Args:
        image_path: Path of the image file to open.
        max_size: ``(width, height)`` bound passed to ``Image.thumbnail``.
        quality: JPEG quality passed to ``Image.save``.

    Returns:
        The re-encoded JPEG bytes as a Base64 string (decoded as UTF-8).
    """
    import base64  # local import so this snippet does not rely on an earlier one

    with Image.open(image_path) as img:
        # Resize in place so the image fits within max_size.
        img.thumbnail(max_size, Image.Resampling.LANCZOS)

        # JPEG cannot store alpha or palette data, so normalise every
        # non-RGB mode (RGBA, P, LA, ...) instead of handling RGBA alone.
        if img.mode != 'RGB':
            img = img.convert('RGB')

        # Encode into memory rather than writing a temporary file.
        buffer = io.BytesIO()
        img.save(buffer, format='JPEG', quality=quality, optimize=True)
        return base64.b64encode(buffer.getvalue()).decode('utf-8')

2. 错误处理

python
# Issue the request and report failures by category. The handlers stay in
# this order so the catch-all clause only sees errors the specific ones missed.
try:
    response = client.chat.completions.create(model="deepseek-vl-chat", messages=messages)
except openai.BadRequestError as err:
    # Heuristic: a bad-request message that mentions "image" is reported as
    # an image format/size problem; anything else is echoed as-is.
    image_related = "image" in str(err).lower()
    print("图像格式或大小不支持" if image_related else f"请求错误: {err}")
except openai.RateLimitError:
    print("请求频率超限,请稍后重试")
except Exception as err:
    print(f"未知错误: {err}")

3. 批量图像处理

python
import asyncio
import aiohttp

async def process_images_batch(image_urls, prompt):
    """Run ``process_single_image`` concurrently for every URL.

    Args:
        image_urls: Iterable of image URLs to process.
        prompt: Text instruction sent with each image.

    Returns:
        The list produced by ``asyncio.gather``; because
        ``return_exceptions=True`` is passed, an exception raised by a task
        is returned as that task's result instead of propagating.
    """
    pending = [process_single_image(image_url, prompt) for image_url in image_urls]
    return await asyncio.gather(*pending, return_exceptions=True)

# `await` requires the SDK's asynchronous client: the synchronous
# `openai.OpenAI` client returns a plain response object that cannot be awaited.
async_client = openai.AsyncOpenAI(
    api_key="YOUR_API_KEY",
    base_url="https://api.deepseek.com"
)


async def process_single_image(image_url, prompt):
    """Ask the model about one image and return the reply text.

    Args:
        image_url: Value placed in the ``image_url`` part of the message.
        prompt: Text instruction sent alongside the image.

    Returns:
        The first choice's message content, or a "处理失败: ..." string
        describing the error if the request raised.
    """
    try:
        response = await async_client.chat.completions.create(
            model="deepseek-vl-chat",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": image_url}}
                    ]
                }
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        # Deliberate best-effort: a failure is reported as a string result
        # for this image rather than raised to the caller.
        return f"处理失败: {e}"

常见用例

1. 图像描述生成

python
def describe_image(image_path):
    """Ask the model for a detailed description of the image at ``image_path``.

    The file is Base64-encoded with ``encode_image`` and sent as a JPEG data
    URL together with a fixed prompt; the text of the first choice is returned.
    """
    data_url = f"data:image/jpeg;base64,{encode_image(image_path)}"
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": "请详细描述这张图片的内容,包括物体、场景、颜色、构图等"},
            {"type": "image_url", "image_url": {"url": data_url}},
        ],
    }

    completion = client.chat.completions.create(
        model="deepseek-vl-chat",
        messages=[user_turn],
    )
    return completion.choices[0].message.content

2. OCR 文字识别

python
def extract_text_from_image(image_path):
    """Ask the model to read out all text found in the image at ``image_path``.

    The file is Base64-encoded with ``encode_image`` and sent as a JPEG data
    URL after a fixed prompt; the text of the first choice is returned.
    """
    encoded = encode_image(image_path)
    parts = [
        {"type": "text", "text": "请识别并提取图片中的所有文字内容"},
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}},
    ]

    reply = client.chat.completions.create(
        model="deepseek-vl-chat",
        messages=[{"role": "user", "content": parts}],
    )
    first_choice = reply.choices[0]
    return first_choice.message.content

3. 图表数据分析

python
def analyze_chart(image_path):
    """Ask the model to analyse the chart image at ``image_path``.

    The file is Base64-encoded with ``encode_image`` and sent as a JPEG data
    URL after a fixed prompt; the text of the first choice is returned.
    """
    chart_b64 = encode_image(image_path)
    request = {
        "model": "deepseek-vl-chat",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "分析这个图表,提取关键数据和趋势"},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{chart_b64}"},
                    },
                ],
            }
        ],
    }
    result = client.chat.completions.create(**request)
    return result.choices[0].message.content

错误代码

| 错误代码 | 描述 | 解决方案 |
| --- | --- | --- |
| invalid_image_format | 不支持的图像格式 | 使用支持的格式 |
| image_too_large | 图像文件过大 | 压缩图像或降低分辨率 |
| invalid_base64 | Base64 编码错误 | 检查编码过程 |
| too_many_images | 图像数量超限 | 减少单次请求的图像数量 |

定价

多模态 API 按输入 token、输出 token 以及处理的图像数量计费:

| 项目 | 价格 |
| --- | --- |
| 输入 token | ¥0.0014/1K tokens |
| 输出 token | ¥0.0028/1K tokens |
| 图像处理 | ¥0.01/张 |

相关资源

基于 DeepSeek AI 大模型技术