Long Text Processing Guide

Learn how to effectively handle long documents, large datasets, and extended conversations with DeepSeek's advanced text processing capabilities.

Overview

DeepSeek supports processing of long texts with:

Extended context length: Up to 128K tokens for comprehensive analysis
Efficient chunking: Smart text segmentation strategies
Context preservation: Maintain coherence across long documents
Memory optimization: Efficient handling of large inputs
Streaming support: Process long texts with real-time output

Context Length Limits

Understanding Token Limits

python

import tiktoken
from openai import OpenAI

client = OpenAI(
    api_key="YOUR_API_KEY",
    base_url="https://api.deepseek.com/v1"
)

def count_tokens(text: str, model: str = "deepseek-chat") -> int:
    """Count tokens in text for the specified model"""
    
    # Use tiktoken for token counting
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")  # Similar tokenization
    tokens = encoding.encode(text)
    return len(tokens)

def check_context_limit(messages: list, model: str = "deepseek-chat") -> dict:
    """Check if messages fit within context limit"""
    
    # Model limits
    limits = {
        "deepseek-chat": 128000,  # 128K tokens
        "deepseek-coder": 128000,
        "deepseek-math": 128000
    }
    
    total_tokens = 0
    for message in messages:
        total_tokens += count_tokens(message["content"])
    
    limit = limits.get(model, 128000)
    
    return {
        "total_tokens": total_tokens,
        "limit": limit,
        "within_limit": total_tokens <= limit,
        "usage_percentage": (total_tokens / limit) * 100,
        "remaining_tokens": limit - total_tokens
    }

# Example usage
long_document = """
[Your long document content here...]
"""

messages = [
    {"role": "system", "content": "You are a document analysis assistant."},
    {"role": "user", "content": f"Please analyze this document:\n\n{long_document}"}
]

context_info = check_context_limit(messages)
print(f"Token usage: {context_info['total_tokens']}/{context_info['limit']} ({context_info['usage_percentage']:.1f}%)")

Model-Specific Limits

python

class ContextManager:
    """Manage context limits for different models"""
    
    MODEL_LIMITS = {
        "deepseek-chat": 128000,
        "deepseek-coder": 128000,
        "deepseek-math": 128000
    }
    
    def __init__(self, model: str = "deepseek-chat"):
        self.model = model
        self.limit = self.MODEL_LIMITS.get(model, 128000)
    
    def can_fit(self, text: str) -> bool:
        """Check if text fits within model limit"""
        return count_tokens(text) <= self.limit
    
    def get_max_input_size(self, reserved_for_response: int = 4000) -> int:
        """Get maximum input size, reserving tokens for response"""
        return self.limit - reserved_for_response
    
    def estimate_response_tokens(self, input_tokens: int, task_type: str = "general") -> int:
        """Estimate response tokens based on task type"""
        
        estimates = {
            "summary": input_tokens * 0.1,      # 10% of input
            "analysis": input_tokens * 0.3,     # 30% of input
            "translation": input_tokens * 1.2,  # 120% of input
            "code_generation": input_tokens * 0.5,  # 50% of input
            "general": input_tokens * 0.2       # 20% of input
        }
        
        return int(estimates.get(task_type, estimates["general"]))

# Usage
manager = ContextManager("deepseek-chat")
max_input = manager.get_max_input_size()
print(f"Maximum input size: {max_input} tokens")

Text Chunking Strategies

Smart Chunking

python

import re
from typing import List, Tuple

class SmartChunker:
    """Intelligent text chunking with context preservation"""
    
    def __init__(self, max_chunk_size: int = 8000, overlap_size: int = 200):
        self.max_chunk_size = max_chunk_size
        self.overlap_size = overlap_size
    
    def chunk_by_paragraphs(self, text: str) -> List[str]:
        """Chunk text by paragraphs, respecting natural boundaries"""
        
        # Split by double newlines (paragraphs)
        paragraphs = text.split('\n\n')
        chunks = []
        current_chunk = ""
        
        for paragraph in paragraphs:
            paragraph = paragraph.strip()
            if not paragraph:
                continue
            
            # Check if adding this paragraph exceeds limit
            test_chunk = current_chunk + "\n\n" + paragraph if current_chunk else paragraph
            
            if count_tokens(test_chunk) <= self.max_chunk_size:
                current_chunk = test_chunk
            else:
                # Save current chunk and start new one
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = paragraph
        
        # Add the last chunk
        if current_chunk:
            chunks.append(current_chunk)
        
        return chunks
    
    def chunk_by_sentences(self, text: str) -> List[str]:
        """Chunk text by sentences for better coherence"""
        
        # Split by sentence endings
        sentences = re.split(r'(?<=[.!?])\s+', text)
        chunks = []
        current_chunk = ""
        
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            
            test_chunk = current_chunk + " " + sentence if current_chunk else sentence
            
            if count_tokens(test_chunk) <= self.max_chunk_size:
                current_chunk = test_chunk
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = sentence
        
        if current_chunk:
            chunks.append(current_chunk)
        
        return chunks
    
    def chunk_with_overlap(self, text: str) -> List[Tuple[str, int, int]]:
        """Chunk text with overlapping sections for context preservation"""
        
        chunks = []
        start = 0
        text_length = len(text)
        
        while start < text_length:
            # Calculate end position
            end = min(start + self.max_chunk_size, text_length)
            
            # Find a good breaking point (sentence or paragraph end)
            if end < text_length:
                # Look for sentence ending within last 200 characters
                search_start = max(end - 200, start)
                sentence_end = text.rfind('.', search_start, end)
                if sentence_end > start:
                    end = sentence_end + 1
            
            chunk = text[start:end].strip()
            chunks.append((chunk, start, end))
            
            # Move start position with overlap
            start = max(start + self.max_chunk_size - self.overlap_size, end)
        
        return chunks
    
    def chunk_by_structure(self, text: str, structure_markers: List[str] = None) -> List[str]:
        """Chunk text based on structural markers (headers, sections)"""
        
        if structure_markers is None:
            structure_markers = [
                r'^#+ ',      # Markdown headers
                r'^\d+\.',    # Numbered sections
                r'^Chapter ', # Chapter markers
                r'^Section ', # Section markers
            ]
        
        # Find all structure markers
        markers = []
        for i, line in enumerate(text.split('\n')):
            for pattern in structure_markers:
                if re.match(pattern, line):
                    markers.append(i)
                    break
        
        if not markers:
            # No structure found, fall back to paragraph chunking
            return self.chunk_by_paragraphs(text)
        
        lines = text.split('\n')
        chunks = []
        
        for i in range(len(markers)):
            start_line = markers[i]
            end_line = markers[i + 1] if i + 1 < len(markers) else len(lines)
            
            section = '\n'.join(lines[start_line:end_line]).strip()
            
            # If section is too large, further chunk it
            if count_tokens(section) > self.max_chunk_size:
                sub_chunks = self.chunk_by_paragraphs(section)
                chunks.extend(sub_chunks)
            else:
                chunks.append(section)
        
        return chunks

# Example usage
chunker = SmartChunker(max_chunk_size=6000, overlap_size=300)

long_text = """
Your very long document content here...
Multiple paragraphs, sections, etc.
"""

# Different chunking strategies
paragraph_chunks = chunker.chunk_by_paragraphs(long_text)
sentence_chunks = chunker.chunk_by_sentences(long_text)
overlap_chunks = chunker.chunk_with_overlap(long_text)
structure_chunks = chunker.chunk_by_structure(long_text)

print(f"Paragraph chunks: {len(paragraph_chunks)}")
print(f"Sentence chunks: {len(sentence_chunks)}")
print(f"Overlap chunks: {len(overlap_chunks)}")
print(f"Structure chunks: {len(structure_chunks)}")

Semantic Chunking

python

class SemanticChunker:
    """Chunk text based on semantic similarity"""
    
    def __init__(self, client, max_chunk_size: int = 6000):
        self.client = client
        self.max_chunk_size = max_chunk_size
    
    def get_semantic_boundaries(self, text: str) -> List[int]:
        """Find semantic boundaries in text"""
        
        paragraphs = text.split('\n\n')
        boundaries = []
        
        for i in range(len(paragraphs) - 1):
            # Get semantic similarity between adjacent paragraphs
            similarity = self.calculate_semantic_similarity(
                paragraphs[i], 
                paragraphs[i + 1]
            )
            
            # If similarity is low, it's a good boundary
            if similarity < 0.5:  # Threshold for semantic break
                boundaries.append(i + 1)
        
        return boundaries
    
    def calculate_semantic_similarity(self, text1: str, text2: str) -> float:
        """Calculate semantic similarity between two texts"""
        
        prompt = f"""
Rate the semantic similarity between these two text segments on a scale of 0.0 to 1.0:

Text 1: {text1[:500]}...
Text 2: {text2[:500]}...

Return only a number between 0.0 and 1.0, where:
- 0.0 = completely different topics
- 1.0 = same topic and context

Similarity score:"""
        
        try:
            response = self.client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "user", "content": prompt}
                ],
                max_tokens=10,
                temperature=0
            )
            
            score_text = response.choices[0].message.content.strip()
            return float(score_text)
        except:
            return 0.5  # Default similarity if calculation fails
    
    def chunk_semantically(self, text: str) -> List[str]:
        """Chunk text based on semantic boundaries"""
        
        paragraphs = text.split('\n\n')
        boundaries = self.get_semantic_boundaries(text)
        
        chunks = []
        start = 0
        
        for boundary in boundaries + [len(paragraphs)]:
            chunk_paragraphs = paragraphs[start:boundary]
            chunk = '\n\n'.join(chunk_paragraphs)
            
            # If chunk is too large, split it further
            if count_tokens(chunk) > self.max_chunk_size:
                sub_chunker = SmartChunker(self.max_chunk_size)
                sub_chunks = sub_chunker.chunk_by_paragraphs(chunk)
                chunks.extend(sub_chunks)
            else:
                chunks.append(chunk)
            
            start = boundary
        
        return chunks

# Usage (requires API calls, use sparingly)
# semantic_chunker = SemanticChunker(client)
# semantic_chunks = semantic_chunker.chunk_semantically(long_text)

Document Processing Workflows

Sequential Processing

python

class DocumentProcessor:
    """Process long documents sequentially with context preservation"""
    
    def __init__(self, client, chunker: SmartChunker = None):
        self.client = client
        self.chunker = chunker or SmartChunker()
    
    def process_document_sequentially(self, document: str, task: str, 
                                    context_preservation: bool = True) -> List[str]:
        """Process document in chunks while preserving context"""
        
        chunks = self.chunker.chunk_by_paragraphs(document)
        results = []
        previous_context = ""
        
        for i, chunk in enumerate(chunks):
            # Build context-aware prompt
            if context_preservation and previous_context:
                prompt = f"""
Previous context: {previous_context[-500:]}...

Current section: {chunk}

Task: {task}

Please process the current section while considering the previous context.
"""
            else:
                prompt = f"""
Section {i+1} of {len(chunks)}:

{chunk}

Task: {task}
"""
            
            response = self.client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "You are a document processing assistant."},
                    {"role": "user", "content": prompt}
                ]
            )
            
            result = response.choices[0].message.content
            results.append(result)
            
            # Update context for next iteration
            if context_preservation:
                previous_context = chunk[-300:] + " " + result[-200:]
        
        return results
    
    def summarize_long_document(self, document: str, summary_type: str = "comprehensive") -> str:
        """Create a summary of a long document"""
        
        # First pass: summarize each chunk
        chunk_summaries = self.process_document_sequentially(
            document, 
            f"Create a {summary_type} summary of this section"
        )
        
        # Second pass: combine summaries
        combined_summaries = "\n\n".join(chunk_summaries)
        
        if count_tokens(combined_summaries) > 8000:
            # If combined summaries are still too long, summarize again
            final_summary_chunks = self.chunker.chunk_by_paragraphs(combined_summaries)
            final_summaries = []
            
            for chunk in final_summary_chunks:
                response = self.client.chat.completions.create(
                    model="deepseek-chat",
                    messages=[
                        {"role": "system", "content": "You are a summarization expert."},
                        {"role": "user", "content": f"Condense this summary further:\n\n{chunk}"}
                    ]
                )
                final_summaries.append(response.choices[0].message.content)
            
            combined_summaries = "\n\n".join(final_summaries)
        
        # Final consolidation
        response = self.client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are a document summarization expert."},
                {"role": "user", "content": f"""
Create a final {summary_type} summary from these section summaries:

{combined_summaries}

Ensure the final summary is coherent, comprehensive, and well-structured.
"""}
            ]
        )
        
        return response.choices[0].message.content
    
    def extract_key_information(self, document: str, extraction_criteria: List[str]) -> dict:
        """Extract specific information from long document"""
        
        criteria_text = "\n".join([f"- {criterion}" for criterion in extraction_criteria])
        
        chunks = self.chunker.chunk_by_paragraphs(document)
        extracted_info = {criterion: [] for criterion in extraction_criteria}
        
        for chunk in chunks:
            prompt = f"""
Extract the following information from this text section:

{criteria_text}

Text section:
{chunk}

Return the information in a structured format. If information is not found, indicate "Not found".
"""
            
            response = self.client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "You are an information extraction specialist."},
                    {"role": "user", "content": prompt}
                ],
                response_format={"type": "json_object"}
            )
            
            try:
                chunk_info = json.loads(response.choices[0].message.content)
                for criterion in extraction_criteria:
                    if criterion in chunk_info and chunk_info[criterion] != "Not found":
                        extracted_info[criterion].append(chunk_info[criterion])
            except:
                continue
        
        # Consolidate extracted information
        consolidated = {}
        for criterion, values in extracted_info.items():
            if values:
                consolidated[criterion] = list(set(values))  # Remove duplicates
            else:
                consolidated[criterion] = "Not found"
        
        return consolidated

# Example usage
processor = DocumentProcessor(client)

# Summarize long document
summary = processor.summarize_long_document(
    long_document, 
    summary_type="executive"
)

# Extract specific information
key_info = processor.extract_key_information(
    long_document,
    [
        "main_conclusions",
        "key_statistics",
        "recommendations",
        "important_dates",
        "mentioned_companies"
    ]
)

print("Summary:", summary)
print("Key Information:", key_info)

Parallel Processing

python

import asyncio
import aiohttp
from typing import List, Dict, Any

class ParallelDocumentProcessor:
    """Process document chunks in parallel for faster processing"""
    
    def __init__(self, api_key: str, base_url: str = "https://api.deepseek.com/v1"):
        self.api_key = api_key
        self.base_url = base_url
        self.chunker = SmartChunker()
    
    async def process_chunk_async(self, session: aiohttp.ClientSession, 
                                chunk: str, task: str, chunk_id: int) -> Dict[str, Any]:
        """Process a single chunk asynchronously"""
        
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        
        payload = {
            "model": "deepseek-chat",
            "messages": [
                {"role": "system", "content": "You are a document processing assistant."},
                {"role": "user", "content": f"Chunk {chunk_id}: {task}\n\n{chunk}"}
            ]
        }
        
        async with session.post(
            f"{self.base_url}/chat/completions",
            headers=headers,
            json=payload
        ) as response:
            result = await response.json()
            return {
                "chunk_id": chunk_id,
                "result": result["choices"][0]["message"]["content"],
                "chunk": chunk[:100] + "..."  # Store snippet for reference
            }
    
    async def process_document_parallel(self, document: str, task: str, 
                                      max_concurrent: int = 5) -> List[Dict[str, Any]]:
        """Process document chunks in parallel"""
        
        chunks = self.chunker.chunk_by_paragraphs(document)
        
        # Create semaphore to limit concurrent requests
        semaphore = asyncio.Semaphore(max_concurrent)
        
        async def process_with_semaphore(session, chunk, task, chunk_id):
            async with semaphore:
                return await self.process_chunk_async(session, chunk, task, chunk_id)
        
        async with aiohttp.ClientSession() as session:
            tasks = [
                process_with_semaphore(session, chunk, task, i)
                for i, chunk in enumerate(chunks)
            ]
            
            results = await asyncio.gather(*tasks, return_exceptions=True)
        
        # Filter out exceptions and sort by chunk_id
        valid_results = [r for r in results if not isinstance(r, Exception)]
        valid_results.sort(key=lambda x: x["chunk_id"])
        
        return valid_results
    
    def run_parallel_processing(self, document: str, task: str) -> List[str]:
        """Run parallel processing (synchronous wrapper)"""
        
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        
        try:
            results = loop.run_until_complete(
                self.process_document_parallel(document, task)
            )
            return [result["result"] for result in results]
        finally:
            loop.close()

# Example usage
# parallel_processor = ParallelDocumentProcessor("YOUR_API_KEY")
# parallel_results = parallel_processor.run_parallel_processing(
#     long_document,
#     "Analyze the main themes and arguments in this section"
# )

Memory-Efficient Processing

Streaming Long Text

python

class StreamingProcessor:
    """Process long texts with streaming for memory efficiency"""
    
    def __init__(self, client):
        self.client = client
    
    def stream_long_text_analysis(self, text: str, analysis_type: str):
        """Stream analysis of long text"""
        
        prompt = f"""
Perform {analysis_type} analysis of this text. Stream your response as you analyze:

{text}
"""
        
        response = self.client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are a text analysis expert. Provide streaming analysis."},
                {"role": "user", "content": prompt}
            ],
            stream=True
        )
        
        full_analysis = ""
        for chunk in response:
            if chunk.choices[0].delta.content:
                content = chunk.choices[0].delta.content
                full_analysis += content
                print(content, end="", flush=True)
        
        return full_analysis
    
    def progressive_summarization(self, text: str, target_length: int = 500):
        """Progressively summarize text to target length"""
        
        current_text = text
        iteration = 1
        
        while count_tokens(current_text) > target_length:
            print(f"Summarization iteration {iteration}...")
            
            # Calculate compression ratio needed
            current_tokens = count_tokens(current_text)
            compression_ratio = target_length / current_tokens
            
            prompt = f"""
Summarize this text to approximately {int(current_tokens * compression_ratio)} tokens:

{current_text}

Maintain key information and structure while reducing length.
"""
            
            response = self.client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "You are a summarization expert."},
                    {"role": "user", "content": prompt}
                ],
                stream=True
            )
            
            current_text = ""
            for chunk in response:
                if chunk.choices[0].delta.content:
                    content = chunk.choices[0].delta.content
                    current_text += content
                    print(content, end="", flush=True)
            
            print(f"\nIteration {iteration} complete. Tokens: {count_tokens(current_text)}")
            iteration += 1
            
            if iteration > 5:  # Prevent infinite loops
                break
        
        return current_text

# Example usage
streaming_processor = StreamingProcessor(client)

# Stream analysis
print("Streaming analysis:")
analysis = streaming_processor.stream_long_text_analysis(
    long_document,
    "thematic"
)

# Progressive summarization
print("\nProgressive summarization:")
final_summary = streaming_processor.progressive_summarization(
    long_document,
    target_length=300
)

Batch Processing

python

class BatchProcessor:
    """Process multiple long documents efficiently"""
    
    def __init__(self, client):
        self.client = client
        self.chunker = SmartChunker()
    
    def create_batch_file(self, documents: List[Dict[str, str]], 
                         output_file: str = "batch_requests.jsonl"):
        """Create batch file for multiple document processing"""
        
        batch_requests = []
        
        for doc_id, doc_data in enumerate(documents):
            document = doc_data["content"]
            task = doc_data.get("task", "Analyze this document")
            
            # Chunk document if too long
            if count_tokens(document) > 100000:  # 100K token limit for batch
                chunks = self.chunker.chunk_by_paragraphs(document)
                
                for chunk_id, chunk in enumerate(chunks):
                    request = {
                        "custom_id": f"doc_{doc_id}_chunk_{chunk_id}",
                        "method": "POST",
                        "url": "/v1/chat/completions",
                        "body": {
                            "model": "deepseek-chat",
                            "messages": [
                                {"role": "system", "content": "You are a document analysis assistant."},
                                {"role": "user", "content": f"{task}\n\nDocument chunk:\n{chunk}"}
                            ]
                        }
                    }
                    batch_requests.append(request)
            else:
                request = {
                    "custom_id": f"doc_{doc_id}",
                    "method": "POST",
                    "url": "/v1/chat/completions",
                    "body": {
                        "model": "deepseek-chat",
                        "messages": [
                            {"role": "system", "content": "You are a document analysis assistant."},
                            {"role": "user", "content": f"{task}\n\nDocument:\n{document}"}
                        ]
                    }
                }
                batch_requests.append(request)
        
        # Write to JSONL file
        with open(output_file, 'w') as f:
            for request in batch_requests:
                f.write(json.dumps(request) + '\n')
        
        return output_file, len(batch_requests)
    
    def process_batch_results(self, results_file: str) -> Dict[str, List[str]]:
        """Process batch results and group by document"""
        
        results = {}
        
        with open(results_file, 'r') as f:
            for line in f:
                result = json.loads(line)
                custom_id = result["custom_id"]
                
                # Extract document ID
                if "_chunk_" in custom_id:
                    doc_id = custom_id.split("_chunk_")[0]
                else:
                    doc_id = custom_id
                
                if doc_id not in results:
                    results[doc_id] = []
                
                content = result["response"]["body"]["choices"][0]["message"]["content"]
                results[doc_id].append(content)
        
        return results

# Example usage
batch_processor = BatchProcessor(client)

# Prepare documents for batch processing
documents = [
    {
        "content": "Long document 1 content...",
        "task": "Summarize the main points"
    },
    {
        "content": "Long document 2 content...",
        "task": "Extract key insights"
    },
    {
        "content": "Long document 3 content...",
        "task": "Identify action items"
    }
]

# Create batch file
batch_file, request_count = batch_processor.create_batch_file(documents)
print(f"Created batch file with {request_count} requests")

# Note: You would then upload this file using the batch API
# and process results when complete

Advanced Techniques

Hierarchical Processing

python

class HierarchicalProcessor:
    """Process documents using hierarchical analysis"""
    
    def __init__(self, client):
        self.client = client
        self.chunker = SmartChunker()
    
    def hierarchical_analysis(self, document: str, levels: List[str]) -> Dict[str, Any]:
        """Perform multi-level hierarchical analysis"""
        
        results = {}
        current_text = document
        
        for level in levels:
            print(f"Processing level: {level}")
            
            if level == "overview":
                # High-level overview
                results[level] = self.get_overview(current_text)
            
            elif level == "sections":
                # Section-by-section analysis
                results[level] = self.analyze_sections(current_text)
            
            elif level == "details":
                # Detailed analysis
                results[level] = self.detailed_analysis(current_text)
            
            elif level == "synthesis":
                # Synthesize all previous levels
                results[level] = self.synthesize_analysis(results)
        
        return results
    
    def get_overview(self, text: str) -> str:
        """Get high-level overview of document"""
        
        # Use first and last parts for overview
        text_tokens = count_tokens(text)
        if text_tokens > 8000:
            beginning = text[:4000]
            ending = text[-4000:]
            sample_text = beginning + "\n\n[... middle content ...]\n\n" + ending
        else:
            sample_text = text
        
        response = self.client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are a document analysis expert."},
                {"role": "user", "content": f"""
Provide a high-level overview of this document:

{sample_text}

Include:
- Main topic and purpose
- Key themes
- Document structure
- Target audience
"""}
            ]
        )
        
        return response.choices[0].message.content
    
    def analyze_sections(self, text: str) -> List[Dict[str, str]]:
        """Analyze document section by section"""
        
        chunks = self.chunker.chunk_by_structure(text)
        section_analyses = []
        
        for i, chunk in enumerate(chunks):
            response = self.client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "You are a section analysis expert."},
                    {"role": "user", "content": f"""
Analyze this document section:

{chunk}

Provide:
- Section summary
- Key points
- Important details
- Relationship to overall document
"""}
                ]
            )
            
            section_analyses.append({
                "section_id": i + 1,
                "content_preview": chunk[:200] + "...",
                "analysis": response.choices[0].message.content
            })
        
        return section_analyses
    
    def detailed_analysis(self, text: str) -> Dict[str, Any]:
        """Perform detailed analysis of specific aspects"""
        
        aspects = [
            "arguments_and_evidence",
            "methodology",
            "conclusions",
            "limitations",
            "implications"
        ]
        
        detailed_results = {}
        
        for aspect in aspects:
            prompt = f"""
Analyze the {aspect.replace('_', ' ')} in this document:

{text[:8000]}...

Focus specifically on {aspect.replace('_', ' ')} and provide detailed insights.
"""
            
            response = self.client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": f"You are an expert in analyzing {aspect.replace('_', ' ')}."},
                    {"role": "user", "content": prompt}
                ]
            )
            
            detailed_results[aspect] = response.choices[0].message.content
        
        return detailed_results
    
    def synthesize_analysis(self, previous_results: Dict[str, Any]) -> str:
        """Synthesize all previous analysis levels"""
        
        synthesis_input = ""
        for level, result in previous_results.items():
            if level != "synthesis":  # Don't include synthesis in synthesis
                synthesis_input += f"\n\n{level.upper()} ANALYSIS:\n{str(result)[:1000]}..."
        
        response = self.client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are a synthesis expert who combines multiple analyses."},
                {"role": "user", "content": f"""
Synthesize these multiple levels of analysis into a comprehensive understanding:

{synthesis_input}

Provide:
- Integrated insights
- Cross-level connections
- Overall assessment
- Key takeaways
"""}
            ]
        )
        
        return response.choices[0].message.content

# Example usage
hierarchical_processor = HierarchicalProcessor(client)

analysis_levels = ["overview", "sections", "details", "synthesis"]
hierarchical_results = hierarchical_processor.hierarchical_analysis(
    long_document,
    analysis_levels
)

for level, result in hierarchical_results.items():
    print(f"\n{level.upper()} ANALYSIS:")
    print("=" * 50)
    print(result)

Best Practices

Optimization Guidelines

Choose appropriate chunking: Use semantic boundaries when possible
Preserve context: Maintain overlap between chunks for coherence
Monitor token usage: Stay within model limits for optimal performance
Use streaming: For real-time processing of long texts
Implement caching: Cache results for repeated processing

Performance Tips

python

# ✅ Good: Smart chunking with context preservation
chunker = SmartChunker(max_chunk_size=6000, overlap_size=300)
chunks = chunker.chunk_by_paragraphs(document)

# ✅ Good: Monitor token usage
context_info = check_context_limit(messages)
if not context_info["within_limit"]:
    # Handle oversized input

# ✅ Good: Use appropriate processing strategy
if count_tokens(document) > 50000:
    # Use parallel processing for very long documents
    results = parallel_processor.run_parallel_processing(document, task)
else:
    # Use sequential processing for moderate length
    results = processor.process_document_sequentially(document, task)

# ❌ Bad: No chunking for long documents
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[{"role": "user", "content": very_long_document}]  # May exceed limits
)

# ❌ Bad: Ignoring token limits
# Not checking if input fits within context window

Troubleshooting

Common Issues

Context length exceeded: Implement proper chunking
Loss of coherence: Use overlapping chunks and context preservation
Slow processing: Consider parallel processing for large documents
Memory issues: Use streaming for very large texts

Debug Tools

python

def debug_long_text_processing(text: str, max_tokens: int = 128000):
    """Debug long text processing issues"""
    
    print(f"Text length: {len(text)} characters")
    print(f"Token count: {count_tokens(text)}")
    print(f"Within limit: {count_tokens(text) <= max_tokens}")
    
    if count_tokens(text) > max_tokens:
        chunker = SmartChunker()
        chunks = chunker.chunk_by_paragraphs(text)
        print(f"Suggested chunks: {len(chunks)}")
        
        for i, chunk in enumerate(chunks[:3]):  # Show first 3 chunks
            print(f"Chunk {i+1} tokens: {count_tokens(chunk)}")
    
    return {
        "needs_chunking": count_tokens(text) > max_tokens,
        "suggested_chunks": len(chunker.chunk_by_paragraphs(text)) if count_tokens(text) > max_tokens else 1
    }

# Usage
debug_info = debug_long_text_processing(long_document)
print("Debug info:", debug_info)

Long Text Processing Guide ​

Overview ​

Context Length Limits ​

Understanding Token Limits ​

Model-Specific Limits ​

Text Chunking Strategies ​

Smart Chunking ​

Semantic Chunking ​

Document Processing Workflows ​

Sequential Processing ​

Parallel Processing ​

Memory-Efficient Processing ​

Streaming Long Text ​

Batch Processing ​

Advanced Techniques ​

Hierarchical Processing ​

Best Practices ​

Optimization Guidelines ​

Performance Tips ​

Troubleshooting ​

Common Issues ​

Debug Tools ​

Next Steps ​

Long Text Processing Guide

Overview

Context Length Limits

Understanding Token Limits

Model-Specific Limits

Text Chunking Strategies

Smart Chunking

Semantic Chunking

Document Processing Workflows

Sequential Processing

Parallel Processing

Memory-Efficient Processing

Streaming Long Text

Batch Processing

Advanced Techniques

Hierarchical Processing

Best Practices

Optimization Guidelines

Performance Tips

Troubleshooting

Common Issues

Debug Tools

Next Steps