KV Cache Guide
Learn how to leverage DeepSeek's Key-Value (KV) cache optimization to improve performance and reduce latency in your applications.
Overview
KV cache is an optimization technique that stores computed key-value pairs from previous tokens, allowing the model to:
- Reduce computation: Avoid recomputing attention for previous tokens
- Improve latency: Faster response times for long conversations
- Save resources: Lower computational overhead
- Enable efficient streaming: Better performance for real-time applications
How KV Cache Works
Basic Concept
```python
# Without KV cache (inefficient):
# each request recomputes attention over all previous tokens.
conversation = [
    "Hello, how are you?",
    "I'm fine, thanks. What about you?",
    "I'm doing well. Can you help me with Python?"
]
# Every new message triggers a full recomputation of the shared prefix.

# With KV cache (efficient):
# key-value pairs from previous tokens are cached and reused,
# so only the new tokens require computation.
```
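To make the difference concrete, here is a toy sketch of the bookkeeping involved. It is plain illustrative Python with no real attention math and no DeepSeek internals: without a cache, the key-value pairs for the whole prefix are recomputed at every step, while with a cache each step only appends the new token's pair.

```python
# Toy illustration of KV caching during decoding (not real model code).
def compute_kv(token):
    # Stand-in for the per-token key/value projections.
    return (f"K({token})", f"V({token})")

def decode_without_cache(tokens):
    computations = 0
    for i in range(1, len(tokens) + 1):
        # Every step recomputes K/V for the entire prefix.
        prefix_kv = [compute_kv(t) for t in tokens[:i]]
        computations += len(prefix_kv)
    return computations

def decode_with_cache(tokens):
    cache, computations = [], 0
    for t in tokens:
        # Only the new token's K/V is computed; earlier pairs are reused.
        cache.append(compute_kv(t))
        computations += 1
    return computations

tokens = ["Hello", ",", "how", "are", "you", "?"]
print(decode_without_cache(tokens))  # 21 K/V computations (grows quadratically)
print(decode_with_cache(tokens))     # 6 K/V computations (grows linearly)
```

For a six-token prompt the uncached loop performs 21 key-value computations versus 6 with the cache, and the gap widens quadratically as the conversation grows.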
Cache Lifecycle
```python
from openai import OpenAI

client = OpenAI(
    api_key="YOUR_API_KEY",
    base_url="https://api.deepseek.com/v1"
)

# Start a conversation with cache initialization
def start_cached_conversation():
    """Initialize a conversation with KV cache"""
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful AI assistant."
            },
            {
                "role": "user",
                "content": "Hello! I'd like to start a conversation about Python programming."
            }
        ],
        # Enable cache optimization
        extra_headers={
            "X-Cache-Enable": "true",
            "X-Cache-TTL": "3600"  # Cache for 1 hour
        }
    )
    return response

# Continue the conversation using cached context
def continue_conversation(messages, new_message):
    """Continue a conversation with cached context"""
    messages.append({
        "role": "user",
        "content": new_message
    })
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages,
        extra_headers={
            "X-Cache-Enable": "true",
            "X-Cache-Reuse": "true"
        }
    )
    messages.append({
        "role": "assistant",
        "content": response.choices[0].message.content
    })
    return response, messages
```
Cache Management
Cache Configuration
```python
import time


class KVCacheManager:
    """Manage KV cache for conversations"""

    def __init__(self, client, cache_ttl=3600):
        self.client = client
        self.cache_ttl = cache_ttl
        self.conversations = {}

    def create_conversation(self, conversation_id: str, system_prompt: str = None):
        """Create a new conversation with cache"""
        messages = []
        if system_prompt:
            messages.append({
                "role": "system",
                "content": system_prompt
            })
        self.conversations[conversation_id] = {
            "messages": messages,
            "cache_id": f"cache_{conversation_id}",
            "created_at": time.time()
        }
        return conversation_id

    def send_message(self, conversation_id: str, message: str):
        """Send a message using the cached conversation"""
        if conversation_id not in self.conversations:
            raise ValueError(f"Conversation {conversation_id} not found")
        conv = self.conversations[conversation_id]
        conv["messages"].append({
            "role": "user",
            "content": message
        })
        response = self.client.chat.completions.create(
            model="deepseek-chat",
            messages=conv["messages"],
            extra_headers={
                "X-Cache-Enable": "true",
                "X-Cache-ID": conv["cache_id"],
                "X-Cache-TTL": str(self.cache_ttl)
            }
        )
        assistant_message = response.choices[0].message.content
        conv["messages"].append({
            "role": "assistant",
            "content": assistant_message
        })
        return assistant_message

    def get_conversation(self, conversation_id: str):
        """Get the conversation history"""
        return self.conversations.get(conversation_id, {}).get("messages", [])

    def clear_cache(self, conversation_id: str):
        """Clear the cache for a conversation"""
        if conversation_id in self.conversations:
            del self.conversations[conversation_id]


# Usage example
cache_manager = KVCacheManager(client)

# Create a conversation
conv_id = cache_manager.create_conversation(
    "user_123",
    "You are a Python programming tutor."
)

# Send messages
response1 = cache_manager.send_message(conv_id, "What are Python decorators?")
response2 = cache_manager.send_message(conv_id, "Can you show me an example?")
response3 = cache_manager.send_message(conv_id, "How do I create my own decorator?")
```
Cache Optimization Strategies
```python
import time


class OptimizedCacheManager:
    """Advanced cache management with optimization"""

    def __init__(self, client):
        self.client = client
        self.cache_stats = {}

    def adaptive_caching(self, conversation_id: str, messages: list,
                         message_threshold: int = 5):
        """Enable caching only for longer conversations"""
        use_cache = len(messages) >= message_threshold
        headers = {}
        if use_cache:
            headers.update({
                "X-Cache-Enable": "true",
                "X-Cache-ID": f"adaptive_{conversation_id}",
                "X-Cache-Strategy": "adaptive"
            })
        response = self.client.chat.completions.create(
            model="deepseek-chat",
            messages=messages,
            extra_headers=headers
        )
        # Track cache usage
        self.cache_stats[conversation_id] = {
            "cache_enabled": use_cache,
            "message_count": len(messages),
            "timestamp": time.time()
        }
        return response

    def context_aware_caching(self, messages: list, context_type: str):
        """Optimize caching based on the context type"""
        cache_strategies = {
            "code_review": {
                "ttl": 7200,   # 2 hours for code discussions
                "priority": "high"
            },
            "casual_chat": {
                "ttl": 1800,   # 30 minutes for casual conversations
                "priority": "low"
            },
            "technical_support": {
                "ttl": 3600,   # 1 hour for support
                "priority": "medium"
            },
            "documentation": {
                "ttl": 14400,  # 4 hours for documentation
                "priority": "high"
            }
        }
        strategy = cache_strategies.get(context_type, {
            "ttl": 3600,
            "priority": "medium"
        })
        response = self.client.chat.completions.create(
            model="deepseek-chat",
            messages=messages,
            extra_headers={
                "X-Cache-Enable": "true",
                "X-Cache-TTL": str(strategy["ttl"]),
                "X-Cache-Priority": strategy["priority"],
                "X-Cache-Context": context_type
            }
        )
        return response

    def get_cache_stats(self):
        """Get cache usage statistics"""
        return self.cache_stats


# Usage
optimizer = OptimizedCacheManager(client)

# Adaptive caching
messages = [
    {"role": "system", "content": "You are a code reviewer."},
    {"role": "user", "content": "Please review this Python function..."},
    # ... more messages
]
response = optimizer.adaptive_caching("review_session_1", messages)

# Context-aware caching
response = optimizer.context_aware_caching(messages, "code_review")
```
Performance Optimization
Streaming with KV Cache
```python
def stream_with_cache(messages: list, conversation_id: str):
    """Stream responses while maintaining the cache"""
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages,
        stream=True,
        extra_headers={
            "X-Cache-Enable": "true",
            "X-Cache-ID": conversation_id,
            "X-Cache-Stream": "true"
        }
    )
    full_response = ""
    for chunk in response:
        if chunk.choices[0].delta.content:
            content = chunk.choices[0].delta.content
            full_response += content
            print(content, end="", flush=True)
    return full_response


# Example usage
conversation_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Explain machine learning in detail."}
]
streamed_response = stream_with_cache(conversation_messages, "ml_explanation")
```
Batch Processing with Cache
```python
def batch_process_with_cache(conversation_batches: list):
    """Process multiple conversations with a shared cache"""
    results = []
    for batch_id, messages in conversation_batches:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=messages,
            extra_headers={
                "X-Cache-Enable": "true",
                "X-Cache-Batch": "true",
                "X-Cache-Batch-ID": batch_id,
                "X-Cache-Share": "true"  # Share cache across the batch
            }
        )
        results.append({
            "batch_id": batch_id,
            "response": response.choices[0].message.content,
            # Reading response headers assumes raw HTTP access
            # (e.g. the SDK's with_raw_response interface).
            "cache_hit": response.headers.get("X-Cache-Hit", "false")
        })
    return results


# Example
batches = [
    ("batch_1", [
        {"role": "user", "content": "What is Python?"}
    ]),
    ("batch_2", [
        {"role": "user", "content": "What are Python's main features?"}
    ]),
    ("batch_3", [
        {"role": "user", "content": "How do I install Python?"}
    ])
]
batch_results = batch_process_with_cache(batches)
```
Cache Monitoring
Performance Metrics
```python
import time


class CacheMonitor:
    """Monitor cache performance and usage"""

    def __init__(self):
        self.metrics = {
            "cache_hits": 0,
            "cache_misses": 0,
            "total_requests": 0,
            "avg_response_time": 0,
            "cache_size": 0
        }
        self.response_times = []

    def track_request(self, response, start_time):
        """Track request metrics"""
        end_time = time.time()
        response_time = end_time - start_time
        self.response_times.append(response_time)
        self.metrics["total_requests"] += 1
        # Check whether the cache was hit (requires raw HTTP header access).
        cache_hit = response.headers.get("X-Cache-Hit", "false") == "true"
        if cache_hit:
            self.metrics["cache_hits"] += 1
        else:
            self.metrics["cache_misses"] += 1
        # Update the running average response time
        self.metrics["avg_response_time"] = sum(self.response_times) / len(self.response_times)
        return {
            "cache_hit": cache_hit,
            "response_time": response_time,
            "cache_hit_rate": self.get_cache_hit_rate()
        }

    def get_cache_hit_rate(self):
        """Calculate the cache hit rate"""
        total = self.metrics["cache_hits"] + self.metrics["cache_misses"]
        if total == 0:
            return 0
        return self.metrics["cache_hits"] / total

    def get_performance_report(self):
        """Generate a performance report"""
        hit_rate = self.get_cache_hit_rate()
        return {
            "cache_hit_rate": f"{hit_rate:.2%}",
            "avg_response_time": f"{self.metrics['avg_response_time']:.3f}s",
            "total_requests": self.metrics["total_requests"],
            "cache_efficiency": "High" if hit_rate > 0.7 else "Medium" if hit_rate > 0.4 else "Low"
        }


# Usage
monitor = CacheMonitor()

def monitored_request(messages, conversation_id):
    """Make a request with monitoring"""
    start_time = time.time()
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages,
        extra_headers={
            "X-Cache-Enable": "true",
            "X-Cache-ID": conversation_id
        }
    )
    metrics = monitor.track_request(response, start_time)
    print(f"Cache hit: {metrics['cache_hit']}, Response time: {metrics['response_time']:.3f}s")
    return response

# Example usage
for i in range(10):
    response = monitored_request([
        {"role": "user", "content": f"Question {i}: What is AI?"}
    ], "ai_questions")

print("Performance Report:", monitor.get_performance_report())
```
Cache Analytics
```python
class CacheAnalytics:
    """Advanced cache analytics and insights"""

    def __init__(self):
        self.cache_data = []
        self.conversation_patterns = {}

    def analyze_conversation_pattern(self, conversation_id: str, messages: list):
        """Analyze conversation patterns for cache optimization"""
        pattern = {
            "message_count": len(messages),
            "avg_message_length": sum(len(msg["content"]) for msg in messages) / len(messages),
            "conversation_type": self.classify_conversation(messages),
            "cache_potential": self.calculate_cache_potential(messages)
        }
        self.conversation_patterns[conversation_id] = pattern
        return pattern

    def classify_conversation(self, messages: list):
        """Classify the conversation type for the cache strategy"""
        content = " ".join([msg["content"] for msg in messages])
        if any(keyword in content.lower() for keyword in ["code", "function", "class", "import"]):
            return "programming"
        elif any(keyword in content.lower() for keyword in ["explain", "what is", "how does"]):
            return "educational"
        elif any(keyword in content.lower() for keyword in ["help", "problem", "error", "issue"]):
            return "support"
        else:
            return "general"

    def calculate_cache_potential(self, messages: list):
        """Calculate the potential cache benefit"""
        # Higher potential for longer conversations
        length_score = min(len(messages) / 10, 1.0)
        # Higher potential for repetitive patterns
        content_similarity = self.calculate_content_similarity(messages)
        # Combined score
        potential = (length_score * 0.6) + (content_similarity * 0.4)
        return potential

    def calculate_content_similarity(self, messages: list):
        """Calculate content similarity as a proxy for cache potential"""
        if len(messages) < 2:
            return 0
        # Simple similarity based on repeated words
        all_words = []
        for msg in messages:
            words = msg["content"].lower().split()
            all_words.extend(words)
        word_freq = {}
        for word in all_words:
            word_freq[word] = word_freq.get(word, 0) + 1
        # Ratio of repeated words to unique words
        repeated_words = sum(1 for freq in word_freq.values() if freq > 1)
        total_unique_words = len(word_freq)
        if total_unique_words == 0:
            return 0
        return repeated_words / total_unique_words

    def get_optimization_recommendations(self, conversation_id: str):
        """Get cache optimization recommendations"""
        pattern = self.conversation_patterns.get(conversation_id)
        if not pattern:
            return "No data available for recommendations"
        recommendations = []
        if pattern["cache_potential"] > 0.7:
            recommendations.append("High cache potential - enable aggressive caching")
        elif pattern["cache_potential"] > 0.4:
            recommendations.append("Medium cache potential - use standard caching")
        else:
            recommendations.append("Low cache potential - consider minimal caching")
        if pattern["conversation_type"] == "programming":
            recommendations.append("Use an extended TTL for code-related conversations")
        elif pattern["conversation_type"] == "support":
            recommendations.append("Use a shorter TTL for support conversations")
        if pattern["message_count"] > 20:
            recommendations.append("Consider conversation chunking for very long conversations")
        return recommendations


# Usage
analytics = CacheAnalytics()

# Analyze a conversation
conversation_messages = [
    {"role": "user", "content": "What is Python?"},
    {"role": "assistant", "content": "Python is a programming language..."},
    {"role": "user", "content": "How do I write a function in Python?"},
    {"role": "assistant", "content": "To write a function in Python..."}
]
pattern = analytics.analyze_conversation_pattern("python_tutorial", conversation_messages)
recommendations = analytics.get_optimization_recommendations("python_tutorial")
print("Conversation Pattern:", pattern)
print("Recommendations:", recommendations)
```
Advanced Use Cases
Multi-User Cache Sharing
```python
import time


class SharedCacheManager:
    """Manage a shared cache across multiple users"""

    def __init__(self, client):
        self.client = client
        self.shared_contexts = {}

    def create_shared_context(self, context_id: str, base_prompt: str,
                              allowed_users: list = None):
        """Create a shared context that multiple users can use"""
        self.shared_contexts[context_id] = {
            "base_prompt": base_prompt,
            "allowed_users": allowed_users or [],
            "usage_count": 0,
            "created_at": time.time()
        }
        # Initialize the shared cache
        self.client.chat.completions.create(
            model="deepseek-chat",
            messages=[{"role": "system", "content": base_prompt}],
            extra_headers={
                "X-Cache-Enable": "true",
                "X-Cache-ID": f"shared_{context_id}",
                "X-Cache-Shared": "true",
                "X-Cache-TTL": "7200"  # 2 hours for shared contexts
            }
        )
        return context_id

    def use_shared_context(self, context_id: str, user_id: str, message: str):
        """Use the shared context for a user request"""
        if context_id not in self.shared_contexts:
            raise ValueError(f"Shared context {context_id} not found")
        context = self.shared_contexts[context_id]
        # Check user permissions
        if context["allowed_users"] and user_id not in context["allowed_users"]:
            raise PermissionError(f"User {user_id} not allowed to use context {context_id}")
        messages = [
            {"role": "system", "content": context["base_prompt"]},
            {"role": "user", "content": message}
        ]
        response = self.client.chat.completions.create(
            model="deepseek-chat",
            messages=messages,
            extra_headers={
                "X-Cache-Enable": "true",
                "X-Cache-ID": f"shared_{context_id}",
                "X-Cache-User": user_id,
                "X-Cache-Reuse": "true"
            }
        )
        context["usage_count"] += 1
        return response.choices[0].message.content


# Example usage
shared_manager = SharedCacheManager(client)

# Create a shared context for code review
context_id = shared_manager.create_shared_context(
    "code_review_team",
    "You are an expert code reviewer. Provide detailed, constructive feedback on code submissions.",
    allowed_users=["dev1", "dev2", "dev3", "lead1"]
)

# Multiple users can use the same context
review1 = shared_manager.use_shared_context(context_id, "dev1", "Please review this Python function...")
review2 = shared_manager.use_shared_context(context_id, "dev2", "Can you check this JavaScript code...")
```
Cache Warming
```python
import time


class CacheWarmer:
    """Pre-warm the cache with common queries"""

    def __init__(self, client):
        self.client = client
        self.warm_cache_patterns = {}

    def define_warm_pattern(self, pattern_id: str, base_messages: list,
                            variations: list):
        """Define a pattern for cache warming"""
        self.warm_cache_patterns[pattern_id] = {
            "base_messages": base_messages,
            "variations": variations,
            "warmed_at": None
        }

    def warm_cache(self, pattern_id: str):
        """Warm the cache with a predefined pattern"""
        if pattern_id not in self.warm_cache_patterns:
            raise ValueError(f"Pattern {pattern_id} not found")
        pattern = self.warm_cache_patterns[pattern_id]
        # Warm the cache with the base pattern
        self.client.chat.completions.create(
            model="deepseek-chat",
            messages=pattern["base_messages"],
            extra_headers={
                "X-Cache-Enable": "true",
                "X-Cache-ID": f"warm_{pattern_id}",
                "X-Cache-Warm": "true"
            }
        )
        # Warm the cache with each variation
        for i, variation in enumerate(pattern["variations"]):
            messages = pattern["base_messages"] + [
                {"role": "user", "content": variation}
            ]
            self.client.chat.completions.create(
                model="deepseek-chat",
                messages=messages,
                extra_headers={
                    "X-Cache-Enable": "true",
                    "X-Cache-ID": f"warm_{pattern_id}_var_{i}",
                    "X-Cache-Warm": "true"
                }
            )
        pattern["warmed_at"] = time.time()
        return f"Cache warmed for pattern {pattern_id}"

    def warm_all_patterns(self):
        """Warm the cache for all defined patterns"""
        results = []
        for pattern_id in self.warm_cache_patterns:
            try:
                result = self.warm_cache(pattern_id)
                results.append(f"✓ {result}")
            except Exception as e:
                results.append(f"✗ Failed to warm {pattern_id}: {e}")
        return results


# Example usage
warmer = CacheWarmer(client)

# Define common patterns
warmer.define_warm_pattern(
    "python_help",
    [{"role": "system", "content": "You are a Python programming assistant."}],
    [
        "What is a list comprehension?",
        "How do I handle exceptions?",
        "What are decorators?",
        "How do I work with files?",
        "What is the difference between lists and tuples?"
    ]
)

warmer.define_warm_pattern(
    "general_qa",
    [{"role": "system", "content": "You are a helpful assistant."}],
    [
        "What is artificial intelligence?",
        "How does machine learning work?",
        "What is the difference between AI and ML?",
        "What are neural networks?",
        "How do I get started with programming?"
    ]
)

# Warm all caches
warm_results = warmer.warm_all_patterns()
for result in warm_results:
    print(result)
```
Best Practices
Cache Strategy Guidelines
- Enable for long conversations: Use the cache for conversations with 5+ messages
- Set an appropriate TTL: Match the cache lifetime to the conversation type
- Monitor performance: Track cache hit rates and response times
- Use shared contexts: Share the cache across similar use cases
- Implement cache warming: Pre-warm the cache for common queries (a helper combining these guidelines is sketched below)
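The sketch below pulls these guidelines into a single helper. It is only a sketch: the `choose_cache_headers` name is invented for illustration, and the `X-Cache-*` headers are the same hypothetical interface used in the examples above, so adapt it to whatever caching controls your deployment actually exposes.

```python
def choose_cache_headers(messages: list, context_type: str = "general") -> dict:
    """Pick cache headers from conversation length and context type (illustrative sketch)."""
    # Guideline: only enable the cache for longer conversations (5+ messages).
    if len(messages) < 5:
        return {}
    # Guideline: match the cache TTL to the conversation type.
    ttl_by_context = {
        "code_review": 7200,        # 2 hours
        "technical_support": 3600,  # 1 hour
        "casual_chat": 1800,        # 30 minutes
        "documentation": 14400,     # 4 hours
    }
    return {
        "X-Cache-Enable": "true",
        "X-Cache-TTL": str(ttl_by_context.get(context_type, 3600)),
        "X-Cache-Context": context_type,
    }

# Usage: pass the result as extra_headers on the chat completion call.
example_messages = [{"role": "user", "content": f"Question {i}"} for i in range(6)]
print(choose_cache_headers(example_messages, "code_review"))
```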
Performance Tips
```python
# ✅ Good: enable the cache only for long conversations
if len(messages) >= 5:
    headers["X-Cache-Enable"] = "true"

# ✅ Good: use a context-specific TTL
ttl_by_context = {
    "code_review": 7200,     # 2 hours
    "casual_chat": 1800,     # 30 minutes
    "documentation": 14400   # 4 hours
}

# ✅ Good: monitor cache performance
def track_cache_metrics(response):
    cache_hit = response.headers.get("X-Cache-Hit") == "true"
    response_time = response.headers.get("X-Response-Time")
    return {"cache_hit": cache_hit, "response_time": response_time}

# ❌ Bad: always enabling the cache
headers["X-Cache-Enable"] = "true"  # Wasteful for short conversations

# ❌ Bad: using the same TTL for every context
headers["X-Cache-TTL"] = "3600"  # One size doesn't fit all
```
Troubleshooting
Common Issues
- Cache misses: Check conversation ID consistency
- Stale responses: Verify TTL settings
- Memory issues: Monitor cache size and cleanup
- Performance degradation: Check cache hit rates
Debug Tools
```python
def debug_cache_behavior(messages, conversation_id):
    """Debug cache behavior for troubleshooting"""
    print(f"Debugging cache for conversation: {conversation_id}")
    print(f"Message count: {len(messages)}")
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages,
        extra_headers={
            "X-Cache-Enable": "true",
            "X-Cache-ID": conversation_id,
            "X-Cache-Debug": "true"
        }
    )
    # Print cache debug info
    cache_headers = {
        k: v for k, v in response.headers.items()
        if k.startswith("X-Cache")
    }
    print("Cache Headers:", cache_headers)
    print("Response:", response.choices[0].message.content[:100] + "...")
    return response


# Usage
debug_response = debug_cache_behavior(
    [{"role": "user", "content": "Test message"}],
    "debug_conversation"
)
```