Long Text Processing Guide
Learn how to effectively handle long documents, large datasets, and extended conversations with DeepSeek's advanced text processing capabilities.
Overview
DeepSeek supports processing of long texts with:
- Extended context length: Up to 128K tokens for comprehensive analysis
- Efficient chunking: Smart text segmentation strategies
- Context preservation: Maintain coherence across long documents
- Memory optimization: Efficient handling of large inputs
- Streaming support: Process long texts with real-time output
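As a quick orientation, a single request can carry a full document as long as it fits in the context window. A minimal sketch (the file name is illustrative):
python
from openai import OpenAI

client = OpenAI(
    api_key="YOUR_API_KEY",
    base_url="https://api.deepseek.com/v1"
)

# Illustrative input file; any text source works
with open("report.txt", encoding="utf-8") as f:
    document = f.read()

response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {"role": "system", "content": "You are a document analysis assistant."},
        {"role": "user", "content": f"Summarize this document:\n\n{document}"}
    ]
)
print(response.choices[0].message.content)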
Context Length Limits
Understanding Token Limits
python
import tiktoken
from openai import OpenAI
client = OpenAI(
api_key="YOUR_API_KEY",
base_url="https://api.deepseek.com/v1"
)
def count_tokens(text: str, model: str = "deepseek-chat") -> int:
    """Approximate the token count of text for the specified model"""
    # DeepSeek does not publish a tiktoken encoding, so cl100k_base
    # serves as a close approximation for estimating usage
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))
def check_context_limit(messages: list, model: str = "deepseek-chat") -> dict:
"""Check if messages fit within context limit"""
# Model limits
limits = {
"deepseek-chat": 128000, # 128K tokens
"deepseek-coder": 128000,
"deepseek-math": 128000
}
total_tokens = 0
for message in messages:
total_tokens += count_tokens(message["content"])
limit = limits.get(model, 128000)
return {
"total_tokens": total_tokens,
"limit": limit,
"within_limit": total_tokens <= limit,
"usage_percentage": (total_tokens / limit) * 100,
"remaining_tokens": limit - total_tokens
}
# Example usage
long_document = """
[Your long document content here...]
"""
messages = [
{"role": "system", "content": "You are a document analysis assistant."},
{"role": "user", "content": f"Please analyze this document:\n\n{long_document}"}
]
context_info = check_context_limit(messages)
print(f"Token usage: {context_info['total_tokens']}/{context_info['limit']} ({context_info['usage_percentage']:.1f}%)")
Model-Specific Limits
python
class ContextManager:
"""Manage context limits for different models"""
MODEL_LIMITS = {
"deepseek-chat": 128000,
"deepseek-coder": 128000,
"deepseek-math": 128000
}
def __init__(self, model: str = "deepseek-chat"):
self.model = model
self.limit = self.MODEL_LIMITS.get(model, 128000)
def can_fit(self, text: str) -> bool:
"""Check if text fits within model limit"""
return count_tokens(text) <= self.limit
def get_max_input_size(self, reserved_for_response: int = 4000) -> int:
"""Get maximum input size, reserving tokens for response"""
return self.limit - reserved_for_response
def estimate_response_tokens(self, input_tokens: int, task_type: str = "general") -> int:
"""Estimate response tokens based on task type"""
estimates = {
"summary": input_tokens * 0.1, # 10% of input
"analysis": input_tokens * 0.3, # 30% of input
"translation": input_tokens * 1.2, # 120% of input
"code_generation": input_tokens * 0.5, # 50% of input
"general": input_tokens * 0.2 # 20% of input
}
return int(estimates.get(task_type, estimates["general"]))
# Usage
manager = ContextManager("deepseek-chat")
max_input = manager.get_max_input_size()
print(f"Maximum input size: {max_input} tokens")
Text Chunking Strategies
Smart Chunking
python
import re
from typing import List, Tuple
class SmartChunker:
"""Intelligent text chunking with context preservation"""
def __init__(self, max_chunk_size: int = 8000, overlap_size: int = 200):
self.max_chunk_size = max_chunk_size
self.overlap_size = overlap_size
def chunk_by_paragraphs(self, text: str) -> List[str]:
"""Chunk text by paragraphs, respecting natural boundaries"""
# Split by double newlines (paragraphs)
paragraphs = text.split('\n\n')
chunks = []
current_chunk = ""
for paragraph in paragraphs:
paragraph = paragraph.strip()
if not paragraph:
continue
# Check if adding this paragraph exceeds limit
test_chunk = current_chunk + "\n\n" + paragraph if current_chunk else paragraph
if count_tokens(test_chunk) <= self.max_chunk_size:
current_chunk = test_chunk
else:
# Save current chunk and start new one
if current_chunk:
chunks.append(current_chunk)
current_chunk = paragraph
# Add the last chunk
if current_chunk:
chunks.append(current_chunk)
return chunks
def chunk_by_sentences(self, text: str) -> List[str]:
"""Chunk text by sentences for better coherence"""
# Split by sentence endings
sentences = re.split(r'(?<=[.!?])\s+', text)
chunks = []
current_chunk = ""
for sentence in sentences:
sentence = sentence.strip()
if not sentence:
continue
test_chunk = current_chunk + " " + sentence if current_chunk else sentence
if count_tokens(test_chunk) <= self.max_chunk_size:
current_chunk = test_chunk
else:
if current_chunk:
chunks.append(current_chunk)
current_chunk = sentence
if current_chunk:
chunks.append(current_chunk)
return chunks
    def chunk_with_overlap(self, text: str) -> List[Tuple[str, int, int]]:
        """Chunk text with overlapping sections for context preservation.

        Note: sizes here are measured in characters, not tokens.
        """
chunks = []
start = 0
text_length = len(text)
while start < text_length:
# Calculate end position
end = min(start + self.max_chunk_size, text_length)
# Find a good breaking point (sentence or paragraph end)
if end < text_length:
# Look for sentence ending within last 200 characters
search_start = max(end - 200, start)
sentence_end = text.rfind('.', search_start, end)
if sentence_end > start:
end = sentence_end + 1
chunk = text[start:end].strip()
chunks.append((chunk, start, end))
            # Step back from `end` by the overlap so adjacent chunks share
            # context, while always making forward progress
            start = max(end - self.overlap_size, start + 1)
return chunks
def chunk_by_structure(self, text: str, structure_markers: List[str] = None) -> List[str]:
"""Chunk text based on structural markers (headers, sections)"""
if structure_markers is None:
structure_markers = [
r'^#+ ', # Markdown headers
r'^\d+\.', # Numbered sections
r'^Chapter ', # Chapter markers
r'^Section ', # Section markers
]
# Find all structure markers
markers = []
for i, line in enumerate(text.split('\n')):
for pattern in structure_markers:
if re.match(pattern, line):
markers.append(i)
break
if not markers:
# No structure found, fall back to paragraph chunking
return self.chunk_by_paragraphs(text)
lines = text.split('\n')
chunks = []
for i in range(len(markers)):
start_line = markers[i]
end_line = markers[i + 1] if i + 1 < len(markers) else len(lines)
section = '\n'.join(lines[start_line:end_line]).strip()
# If section is too large, further chunk it
if count_tokens(section) > self.max_chunk_size:
sub_chunks = self.chunk_by_paragraphs(section)
chunks.extend(sub_chunks)
else:
chunks.append(section)
return chunks
# Example usage
chunker = SmartChunker(max_chunk_size=6000, overlap_size=300)
long_text = """
Your very long document content here...
Multiple paragraphs, sections, etc.
"""
# Different chunking strategies
paragraph_chunks = chunker.chunk_by_paragraphs(long_text)
sentence_chunks = chunker.chunk_by_sentences(long_text)
overlap_chunks = chunker.chunk_with_overlap(long_text)
structure_chunks = chunker.chunk_by_structure(long_text)
print(f"Paragraph chunks: {len(paragraph_chunks)}")
print(f"Sentence chunks: {len(sentence_chunks)}")
print(f"Overlap chunks: {len(overlap_chunks)}")
print(f"Structure chunks: {len(structure_chunks)}")
Semantic Chunking
python
class SemanticChunker:
"""Chunk text based on semantic similarity"""
def __init__(self, client, max_chunk_size: int = 6000):
self.client = client
self.max_chunk_size = max_chunk_size
def get_semantic_boundaries(self, text: str) -> List[int]:
"""Find semantic boundaries in text"""
paragraphs = text.split('\n\n')
boundaries = []
for i in range(len(paragraphs) - 1):
# Get semantic similarity between adjacent paragraphs
similarity = self.calculate_semantic_similarity(
paragraphs[i],
paragraphs[i + 1]
)
# If similarity is low, it's a good boundary
if similarity < 0.5: # Threshold for semantic break
boundaries.append(i + 1)
return boundaries
def calculate_semantic_similarity(self, text1: str, text2: str) -> float:
"""Calculate semantic similarity between two texts"""
prompt = f"""
Rate the semantic similarity between these two text segments on a scale of 0.0 to 1.0:
Text 1: {text1[:500]}...
Text 2: {text2[:500]}...
Return only a number between 0.0 and 1.0, where:
- 0.0 = completely different topics
- 1.0 = same topic and context
Similarity score:"""
try:
response = self.client.chat.completions.create(
model="deepseek-chat",
messages=[
{"role": "user", "content": prompt}
],
max_tokens=10,
temperature=0
)
            score_text = response.choices[0].message.content.strip()
            # Clamp to [0, 1] in case the model returns an out-of-range value
            return max(0.0, min(1.0, float(score_text)))
        except Exception:
            return 0.5  # Default similarity if the call or parsing fails
def chunk_semantically(self, text: str) -> List[str]:
"""Chunk text based on semantic boundaries"""
paragraphs = text.split('\n\n')
boundaries = self.get_semantic_boundaries(text)
chunks = []
start = 0
for boundary in boundaries + [len(paragraphs)]:
chunk_paragraphs = paragraphs[start:boundary]
chunk = '\n\n'.join(chunk_paragraphs)
# If chunk is too large, split it further
if count_tokens(chunk) > self.max_chunk_size:
sub_chunker = SmartChunker(self.max_chunk_size)
sub_chunks = sub_chunker.chunk_by_paragraphs(chunk)
chunks.extend(sub_chunks)
else:
chunks.append(chunk)
start = boundary
return chunks
# Usage (requires API calls, use sparingly)
# semantic_chunker = SemanticChunker(client)
# semantic_chunks = semantic_chunker.chunk_semantically(long_text)
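Since every boundary check above costs an API call, a local heuristic can stand in during development. A sketch using only the standard library; lexical overlap is a rough proxy for semantic similarity, not a replacement:
python
import difflib

class LocalSemanticChunker(SemanticChunker):
    """Development variant that finds boundaries without API calls."""
    def calculate_semantic_similarity(self, text1: str, text2: str) -> float:
        # Character-level overlap as a cheap stand-in for semantic similarity
        return difflib.SequenceMatcher(None, text1[:500], text2[:500]).ratio()

# local_chunker = LocalSemanticChunker(client=None)
# local_chunks = local_chunker.chunk_semantically(long_text)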
Document Processing Workflows
Sequential Processing
python
import json

class DocumentProcessor:
"""Process long documents sequentially with context preservation"""
def __init__(self, client, chunker: SmartChunker = None):
self.client = client
self.chunker = chunker or SmartChunker()
def process_document_sequentially(self, document: str, task: str,
context_preservation: bool = True) -> List[str]:
"""Process document in chunks while preserving context"""
chunks = self.chunker.chunk_by_paragraphs(document)
results = []
previous_context = ""
for i, chunk in enumerate(chunks):
# Build context-aware prompt
if context_preservation and previous_context:
prompt = f"""
Previous context: {previous_context[-500:]}...
Current section: {chunk}
Task: {task}
Please process the current section while considering the previous context.
"""
else:
prompt = f"""
Section {i+1} of {len(chunks)}:
{chunk}
Task: {task}
"""
response = self.client.chat.completions.create(
model="deepseek-chat",
messages=[
{"role": "system", "content": "You are a document processing assistant."},
{"role": "user", "content": prompt}
]
)
result = response.choices[0].message.content
results.append(result)
# Update context for next iteration
if context_preservation:
previous_context = chunk[-300:] + " " + result[-200:]
return results
def summarize_long_document(self, document: str, summary_type: str = "comprehensive") -> str:
"""Create a summary of a long document"""
# First pass: summarize each chunk
chunk_summaries = self.process_document_sequentially(
document,
f"Create a {summary_type} summary of this section"
)
# Second pass: combine summaries
combined_summaries = "\n\n".join(chunk_summaries)
if count_tokens(combined_summaries) > 8000:
# If combined summaries are still too long, summarize again
final_summary_chunks = self.chunker.chunk_by_paragraphs(combined_summaries)
final_summaries = []
for chunk in final_summary_chunks:
response = self.client.chat.completions.create(
model="deepseek-chat",
messages=[
{"role": "system", "content": "You are a summarization expert."},
{"role": "user", "content": f"Condense this summary further:\n\n{chunk}"}
]
)
final_summaries.append(response.choices[0].message.content)
combined_summaries = "\n\n".join(final_summaries)
# Final consolidation
response = self.client.chat.completions.create(
model="deepseek-chat",
messages=[
{"role": "system", "content": "You are a document summarization expert."},
{"role": "user", "content": f"""
Create a final {summary_type} summary from these section summaries:
{combined_summaries}
Ensure the final summary is coherent, comprehensive, and well-structured.
"""}
]
)
return response.choices[0].message.content
def extract_key_information(self, document: str, extraction_criteria: List[str]) -> dict:
"""Extract specific information from long document"""
criteria_text = "\n".join([f"- {criterion}" for criterion in extraction_criteria])
chunks = self.chunker.chunk_by_paragraphs(document)
extracted_info = {criterion: [] for criterion in extraction_criteria}
for chunk in chunks:
prompt = f"""
Extract the following information from this text section:
{criteria_text}
Text section:
{chunk}
        Return the information as a JSON object keyed by criterion. If a piece of information is not found, use the value "Not found".
"""
response = self.client.chat.completions.create(
model="deepseek-chat",
messages=[
{"role": "system", "content": "You are an information extraction specialist."},
{"role": "user", "content": prompt}
],
response_format={"type": "json_object"}
)
try:
chunk_info = json.loads(response.choices[0].message.content)
for criterion in extraction_criteria:
if criterion in chunk_info and chunk_info[criterion] != "Not found":
extracted_info[criterion].append(chunk_info[criterion])
            except (json.JSONDecodeError, KeyError):
                continue
# Consolidate extracted information
consolidated = {}
        for criterion, values in extracted_info.items():
            if values:
                # Deduplicate by string form, since JSON values may be unhashable
                consolidated[criterion] = list({str(v): v for v in values}.values())
            else:
                consolidated[criterion] = "Not found"
return consolidated
# Example usage
processor = DocumentProcessor(client)
# Summarize long document
summary = processor.summarize_long_document(
long_document,
summary_type="executive"
)
# Extract specific information
key_info = processor.extract_key_information(
long_document,
[
"main_conclusions",
"key_statistics",
"recommendations",
"important_dates",
"mentioned_companies"
]
)
print("Summary:", summary)
print("Key Information:", key_info)
Parallel Processing
python
import asyncio
import aiohttp
from typing import List, Dict, Any
class ParallelDocumentProcessor:
"""Process document chunks in parallel for faster processing"""
def __init__(self, api_key: str, base_url: str = "https://api.deepseek.com/v1"):
self.api_key = api_key
self.base_url = base_url
self.chunker = SmartChunker()
async def process_chunk_async(self, session: aiohttp.ClientSession,
chunk: str, task: str, chunk_id: int) -> Dict[str, Any]:
"""Process a single chunk asynchronously"""
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
}
payload = {
"model": "deepseek-chat",
"messages": [
{"role": "system", "content": "You are a document processing assistant."},
{"role": "user", "content": f"Chunk {chunk_id}: {task}\n\n{chunk}"}
]
}
async with session.post(
f"{self.base_url}/chat/completions",
headers=headers,
json=payload
) as response:
result = await response.json()
return {
"chunk_id": chunk_id,
"result": result["choices"][0]["message"]["content"],
"chunk": chunk[:100] + "..." # Store snippet for reference
}
async def process_document_parallel(self, document: str, task: str,
max_concurrent: int = 5) -> List[Dict[str, Any]]:
"""Process document chunks in parallel"""
chunks = self.chunker.chunk_by_paragraphs(document)
# Create semaphore to limit concurrent requests
semaphore = asyncio.Semaphore(max_concurrent)
async def process_with_semaphore(session, chunk, task, chunk_id):
async with semaphore:
return await self.process_chunk_async(session, chunk, task, chunk_id)
async with aiohttp.ClientSession() as session:
tasks = [
process_with_semaphore(session, chunk, task, i)
for i, chunk in enumerate(chunks)
]
results = await asyncio.gather(*tasks, return_exceptions=True)
# Filter out exceptions and sort by chunk_id
valid_results = [r for r in results if not isinstance(r, Exception)]
valid_results.sort(key=lambda x: x["chunk_id"])
return valid_results
    def run_parallel_processing(self, document: str, task: str) -> List[str]:
        """Run parallel processing (synchronous wrapper)"""
        results = asyncio.run(
            self.process_document_parallel(document, task)
        )
        return [result["result"] for result in results]
# Example usage
# parallel_processor = ParallelDocumentProcessor("YOUR_API_KEY")
# parallel_results = parallel_processor.run_parallel_processing(
# long_document,
# "Analyze the main themes and arguments in this section"
# )
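Parallel requests are also the most likely to hit rate limits, so a retry wrapper with exponential backoff is a sensible addition. A minimal sketch around process_chunk_async:
python
async def process_chunk_with_retry(processor: ParallelDocumentProcessor,
                                   session: aiohttp.ClientSession,
                                   chunk: str, task: str, chunk_id: int,
                                   max_retries: int = 3):
    """Retry a chunk request with exponential backoff on failure."""
    for attempt in range(max_retries):
        try:
            return await processor.process_chunk_async(session, chunk, task, chunk_id)
        except Exception:
            if attempt == max_retries - 1:
                raise
            await asyncio.sleep(2 ** attempt)  # Wait 1s, 2s, 4s, ...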
Memory-Efficient Processing
Streaming Long Text
python
class StreamingProcessor:
"""Process long texts with streaming for memory efficiency"""
def __init__(self, client):
self.client = client
def stream_long_text_analysis(self, text: str, analysis_type: str):
"""Stream analysis of long text"""
prompt = f"""
Perform {analysis_type} analysis of this text. Stream your response as you analyze:
{text}
"""
response = self.client.chat.completions.create(
model="deepseek-chat",
messages=[
{"role": "system", "content": "You are a text analysis expert. Provide streaming analysis."},
{"role": "user", "content": prompt}
],
stream=True
)
full_analysis = ""
for chunk in response:
if chunk.choices[0].delta.content:
content = chunk.choices[0].delta.content
full_analysis += content
print(content, end="", flush=True)
return full_analysis
def progressive_summarization(self, text: str, target_length: int = 500):
"""Progressively summarize text to target length"""
current_text = text
iteration = 1
while count_tokens(current_text) > target_length:
print(f"Summarization iteration {iteration}...")
            prompt = f"""
            Summarize this text to approximately {target_length} tokens:
{current_text}
Maintain key information and structure while reducing length.
"""
response = self.client.chat.completions.create(
model="deepseek-chat",
messages=[
{"role": "system", "content": "You are a summarization expert."},
{"role": "user", "content": prompt}
],
stream=True
)
current_text = ""
for chunk in response:
if chunk.choices[0].delta.content:
content = chunk.choices[0].delta.content
current_text += content
print(content, end="", flush=True)
print(f"\nIteration {iteration} complete. Tokens: {count_tokens(current_text)}")
iteration += 1
if iteration > 5: # Prevent infinite loops
break
return current_text
# Example usage
streaming_processor = StreamingProcessor(client)
# Stream analysis
print("Streaming analysis:")
analysis = streaming_processor.stream_long_text_analysis(
long_document,
"thematic"
)
# Progressive summarization
print("\nProgressive summarization:")
final_summary = streaming_processor.progressive_summarization(
long_document,
target_length=300
)
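For inputs too large to hold in memory at all, the text can be read lazily from disk and chunked on the fly. A sketch (the file path is illustrative):
python
def iter_file_chunks(path: str, max_chunk_size: int = 6000):
    """Yield token-bounded chunks from a file without loading it fully."""
    current_chunk = ""
    with open(path, encoding="utf-8") as f:
        for line in f:
            candidate = current_chunk + line
            if count_tokens(candidate) > max_chunk_size and current_chunk:
                yield current_chunk
                current_chunk = line
            else:
                current_chunk = candidate
    if current_chunk:
        yield current_chunk

# for chunk in iter_file_chunks("very_large_file.txt"):
#     process each chunk independently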
Batch Processing
python
class BatchProcessor:
"""Process multiple long documents efficiently"""
def __init__(self, client):
self.client = client
self.chunker = SmartChunker()
def create_batch_file(self, documents: List[Dict[str, str]],
output_file: str = "batch_requests.jsonl"):
"""Create batch file for multiple document processing"""
batch_requests = []
for doc_id, doc_data in enumerate(documents):
document = doc_data["content"]
task = doc_data.get("task", "Analyze this document")
# Chunk document if too long
if count_tokens(document) > 100000: # 100K token limit for batch
chunks = self.chunker.chunk_by_paragraphs(document)
for chunk_id, chunk in enumerate(chunks):
request = {
"custom_id": f"doc_{doc_id}_chunk_{chunk_id}",
"method": "POST",
"url": "/v1/chat/completions",
"body": {
"model": "deepseek-chat",
"messages": [
{"role": "system", "content": "You are a document analysis assistant."},
{"role": "user", "content": f"{task}\n\nDocument chunk:\n{chunk}"}
]
}
}
batch_requests.append(request)
else:
request = {
"custom_id": f"doc_{doc_id}",
"method": "POST",
"url": "/v1/chat/completions",
"body": {
"model": "deepseek-chat",
"messages": [
{"role": "system", "content": "You are a document analysis assistant."},
{"role": "user", "content": f"{task}\n\nDocument:\n{document}"}
]
}
}
batch_requests.append(request)
# Write to JSONL file
with open(output_file, 'w') as f:
for request in batch_requests:
f.write(json.dumps(request) + '\n')
return output_file, len(batch_requests)
def process_batch_results(self, results_file: str) -> Dict[str, List[str]]:
"""Process batch results and group by document"""
results = {}
with open(results_file, 'r') as f:
for line in f:
result = json.loads(line)
custom_id = result["custom_id"]
# Extract document ID
if "_chunk_" in custom_id:
doc_id = custom_id.split("_chunk_")[0]
else:
doc_id = custom_id
if doc_id not in results:
results[doc_id] = []
content = result["response"]["body"]["choices"][0]["message"]["content"]
results[doc_id].append(content)
return results
# Example usage
batch_processor = BatchProcessor(client)
# Prepare documents for batch processing
documents = [
{
"content": "Long document 1 content...",
"task": "Summarize the main points"
},
{
"content": "Long document 2 content...",
"task": "Extract key insights"
},
{
"content": "Long document 3 content...",
"task": "Identify action items"
}
]
# Create batch file
batch_file, request_count = batch_processor.create_batch_file(documents)
print(f"Created batch file with {request_count} requests")
# Note: You would then upload this file using the batch API
# and process results when complete
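If DeepSeek's batch endpoint follows the OpenAI-style Batch API, the upload and submission step might look like the sketch below; verify against the current API reference before relying on it:
python
# Assumes an OpenAI-compatible Batch API at the configured base_url
batch_input = client.files.create(
    file=open(batch_file, "rb"),
    purpose="batch"
)
batch_job = client.batches.create(
    input_file_id=batch_input.id,
    endpoint="/v1/chat/completions",
    completion_window="24h"
)
print(f"Batch submitted: {batch_job.id}")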
Advanced Techniques
Hierarchical Processing
python
class HierarchicalProcessor:
"""Process documents using hierarchical analysis"""
def __init__(self, client):
self.client = client
self.chunker = SmartChunker()
def hierarchical_analysis(self, document: str, levels: List[str]) -> Dict[str, Any]:
"""Perform multi-level hierarchical analysis"""
results = {}
current_text = document
for level in levels:
print(f"Processing level: {level}")
if level == "overview":
# High-level overview
results[level] = self.get_overview(current_text)
elif level == "sections":
# Section-by-section analysis
results[level] = self.analyze_sections(current_text)
elif level == "details":
# Detailed analysis
results[level] = self.detailed_analysis(current_text)
elif level == "synthesis":
# Synthesize all previous levels
results[level] = self.synthesize_analysis(results)
return results
    def get_overview(self, text: str) -> str:
        """Get high-level overview of document"""
        # Sample the beginning and end of long documents; the slices below
        # are in characters, a rough proxy for the token budget
        text_tokens = count_tokens(text)
        if text_tokens > 8000:
            beginning = text[:4000]
            ending = text[-4000:]
sample_text = beginning + "\n\n[... middle content ...]\n\n" + ending
else:
sample_text = text
response = self.client.chat.completions.create(
model="deepseek-chat",
messages=[
{"role": "system", "content": "You are a document analysis expert."},
{"role": "user", "content": f"""
Provide a high-level overview of this document:
{sample_text}
Include:
- Main topic and purpose
- Key themes
- Document structure
- Target audience
"""}
]
)
return response.choices[0].message.content
def analyze_sections(self, text: str) -> List[Dict[str, str]]:
"""Analyze document section by section"""
chunks = self.chunker.chunk_by_structure(text)
section_analyses = []
for i, chunk in enumerate(chunks):
response = self.client.chat.completions.create(
model="deepseek-chat",
messages=[
{"role": "system", "content": "You are a section analysis expert."},
{"role": "user", "content": f"""
Analyze this document section:
{chunk}
Provide:
- Section summary
- Key points
- Important details
- Relationship to overall document
"""}
]
)
section_analyses.append({
"section_id": i + 1,
"content_preview": chunk[:200] + "...",
"analysis": response.choices[0].message.content
})
return section_analyses
def detailed_analysis(self, text: str) -> Dict[str, Any]:
"""Perform detailed analysis of specific aspects"""
aspects = [
"arguments_and_evidence",
"methodology",
"conclusions",
"limitations",
"implications"
]
detailed_results = {}
for aspect in aspects:
prompt = f"""
Analyze the {aspect.replace('_', ' ')} in this document:
{text[:8000]}...
Focus specifically on {aspect.replace('_', ' ')} and provide detailed insights.
"""
response = self.client.chat.completions.create(
model="deepseek-chat",
messages=[
{"role": "system", "content": f"You are an expert in analyzing {aspect.replace('_', ' ')}."},
{"role": "user", "content": prompt}
]
)
detailed_results[aspect] = response.choices[0].message.content
return detailed_results
def synthesize_analysis(self, previous_results: Dict[str, Any]) -> str:
"""Synthesize all previous analysis levels"""
synthesis_input = ""
for level, result in previous_results.items():
if level != "synthesis": # Don't include synthesis in synthesis
synthesis_input += f"\n\n{level.upper()} ANALYSIS:\n{str(result)[:1000]}..."
response = self.client.chat.completions.create(
model="deepseek-chat",
messages=[
{"role": "system", "content": "You are a synthesis expert who combines multiple analyses."},
{"role": "user", "content": f"""
Synthesize these multiple levels of analysis into a comprehensive understanding:
{synthesis_input}
Provide:
- Integrated insights
- Cross-level connections
- Overall assessment
- Key takeaways
"""}
]
)
return response.choices[0].message.content
# Example usage
hierarchical_processor = HierarchicalProcessor(client)
analysis_levels = ["overview", "sections", "details", "synthesis"]
hierarchical_results = hierarchical_processor.hierarchical_analysis(
long_document,
analysis_levels
)
for level, result in hierarchical_results.items():
print(f"\n{level.upper()} ANALYSIS:")
print("=" * 50)
print(result)
Best Practices
Optimization Guidelines
- Choose appropriate chunking: Use semantic boundaries when possible
- Preserve context: Maintain overlap between chunks for coherence
- Monitor token usage: Stay within model limits for optimal performance
- Use streaming: For real-time processing of long texts
- Implement caching: Cache results for repeated processing, as sketched below
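The caching point above is straightforward to implement with a content-hash key; a minimal in-memory sketch:
python
import hashlib

_result_cache = {}

def cached_process(chunk: str, task: str, process_fn):
    """Cache results keyed by a hash of the chunk and task."""
    key = hashlib.sha256(f"{task}\n{chunk}".encode("utf-8")).hexdigest()
    if key not in _result_cache:
        _result_cache[key] = process_fn(chunk, task)
    return _result_cache[key]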
Performance Tips
python
# ✅ Good: Smart chunking with context preservation
chunker = SmartChunker(max_chunk_size=6000, overlap_size=300)
chunks = chunker.chunk_by_paragraphs(document)
# ✅ Good: Monitor token usage
context_info = check_context_limit(messages)
if not context_info["within_limit"]:
    chunks = chunker.chunk_by_paragraphs(document)  # Handle oversized input
# ✅ Good: Use appropriate processing strategy
if count_tokens(document) > 50000:
# Use parallel processing for very long documents
results = parallel_processor.run_parallel_processing(document, task)
else:
# Use sequential processing for moderate length
results = processor.process_document_sequentially(document, task)
# ❌ Bad: No chunking for long documents
response = client.chat.completions.create(
model="deepseek-chat",
messages=[{"role": "user", "content": very_long_document}] # May exceed limits
)
# ❌ Bad: Ignoring token limits
# Not checking if input fits within context window
Troubleshooting
Common Issues
- Context length exceeded: Implement proper chunking
- Loss of coherence: Use overlapping chunks and context preservation
- Slow processing: Consider parallel processing for large documents
- Memory issues: Use streaming for very large texts
Debug Tools
python
def debug_long_text_processing(text: str, max_tokens: int = 128000):
    """Debug long text processing issues"""
    token_count = count_tokens(text)  # Count once and reuse
    print(f"Text length: {len(text)} characters")
    print(f"Token count: {token_count}")
    print(f"Within limit: {token_count <= max_tokens}")
    chunks = []
    if token_count > max_tokens:
        chunker = SmartChunker()
        chunks = chunker.chunk_by_paragraphs(text)
        print(f"Suggested chunks: {len(chunks)}")
        for i, chunk in enumerate(chunks[:3]):  # Show first 3 chunks
            print(f"Chunk {i+1} tokens: {count_tokens(chunk)}")
    return {
        "needs_chunking": token_count > max_tokens,
        "suggested_chunks": len(chunks) if chunks else 1
    }
# Usage
debug_info = debug_long_text_processing(long_document)
print("Debug info:", debug_info)