When you’re building production systems that rely on LLM APIs, you quickly learn that “it works in development” doesn’t mean much. Rate limits hit at the worst times, APIs go down, and costs can spiral if you’re not careful. Here’s how to build integrations that actually survive the real world.

The Problem with Naive Integrations

Most tutorials show you something like this:

1
2
3
4
5
6
7
8
# Minimal "tutorial" integration: a single, unguarded API call with no
# retries, caching, or fallback — the baseline the article improves on.
import anthropic

client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the environment
# NOTE(review): `prompt` is assumed to be defined by surrounding code — confirm.
response = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    messages=[{"role": "user", "content": prompt}]
)

This works great until:

  • You hit rate limits during a traffic spike
  • The API returns a 500 error
  • Your monthly bill arrives
  • A timeout leaves your user hanging

Pattern 1: Exponential Backoff with Jitter

Rate limits are inevitable. The key is handling them gracefully:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import time
import random
from functools import wraps

def retry_with_backoff(max_retries=5, base_delay=1, max_delay=60):
    """Decorator: retry on rate limits (exponential backoff + jitter) and 5xx errors.

    Args:
        max_retries: Total attempts before the last error is re-raised.
        base_delay: Base delay in seconds for the backoff schedule.
        max_delay: Cap on any single backoff delay.

    Raises:
        anthropic.RateLimitError / anthropic.APIStatusError: re-raised once
        retries are exhausted; non-5xx API errors are re-raised immediately.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            retries = 0
            # Loop exits only via return or raise; the original version also
            # had an unreachable trailing call to func() after the loop, which
            # would have made an extra, unprotected attempt if the loop
            # condition were ever relied upon.
            while True:
                try:
                    return func(*args, **kwargs)
                except anthropic.RateLimitError:
                    retries += 1
                    if retries >= max_retries:
                        raise

                    # Exponential backoff capped at max_delay; jitter spreads
                    # retries out to avoid the "thundering herd" problem.
                    delay = min(base_delay * (2 ** retries), max_delay)
                    sleep_for = delay + random.uniform(0, delay * 0.1)

                    # Report the time we will actually sleep (incl. jitter).
                    print(f"Rate limited. Retry {retries}/{max_retries} in {sleep_for:.1f}s")
                    time.sleep(sleep_for)

                except anthropic.APIStatusError as e:
                    # Client errors (< 500) are bugs in our request: fail fast.
                    if e.status_code < 500:
                        raise
                    # Server-side (5xx) failures: linear backoff, then retry.
                    retries += 1
                    if retries >= max_retries:
                        raise
                    time.sleep(base_delay * retries)
        return wrapper
    return decorator

@retry_with_backoff(max_retries=5)
def call_llm(prompt: str) -> str:
    """Send a single-turn prompt to Claude and return the reply text.

    Rate limits and 5xx errors are retried transparently by the
    retry_with_backoff decorator.
    """
    message = {"role": "user", "content": prompt}
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=1024,
        messages=[message],
    )
    return response.content[0].text

The jitter prevents the “thundering herd” problem where all your retries hit simultaneously.

Pattern 2: Provider Fallbacks

Don’t put all your eggs in one basket. When one provider is down or rate-limited, fall back to another:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from dataclasses import dataclass
from typing import Optional, List
import anthropic
import openai

@dataclass
class LLMProvider:
    """One configured LLM backend in a fallback chain.

    Attributes:
        name: Provider key ("anthropic" or "openai") used for dispatch.
        client: The provider's SDK client instance.
        model: Model identifier to request from this provider.
        priority: Lower values are tried first.
    """
    name: str
    # `object`, not the builtin function `any` — `any` is not a type and was
    # almost certainly a typo for typing.Any.
    client: object
    model: str
    priority: int = 0

class ResilientLLM:
    """Try providers in priority order, falling back on any failure."""

    def __init__(self, providers: List["LLMProvider"]):
        # Lowest priority value is tried first.
        self.providers = sorted(providers, key=lambda p: p.priority)

    def complete(self, prompt: str, max_tokens: int = 1024) -> str:
        """Return the first successful completion.

        Raises:
            Exception: listing every per-provider error when all providers fail.
        """
        errors = []

        for provider in self.providers:
            try:
                return self._call_provider(provider, prompt, max_tokens)
            except Exception as e:
                errors.append(f"{provider.name}: {e}")
                continue

        raise Exception(f"All providers failed: {errors}")

    def _call_provider(self, provider: "LLMProvider", prompt: str, max_tokens: int) -> str:
        """Dispatch to the provider-specific SDK call and normalize the response."""
        if provider.name == "anthropic":
            response = provider.client.messages.create(
                model=provider.model,
                max_tokens=max_tokens,
                messages=[{"role": "user", "content": prompt}]
            )
            return response.content[0].text

        elif provider.name == "openai":
            response = provider.client.chat.completions.create(
                model=provider.model,
                max_tokens=max_tokens,
                messages=[{"role": "user", "content": prompt}]
            )
            return response.choices[0].message.content

        # Previously an unrecognized name fell off the end and returned None,
        # which complete() then reported as a *successful* completion.
        raise ValueError(f"Unknown provider: {provider.name}")

# Usage
# Primary: Anthropic first (priority 0); OpenAI as fallback (priority 1).
# Both SDK clients read their API keys from the environment by default.
llm = ResilientLLM([
    LLMProvider("anthropic", anthropic.Anthropic(), "claude-sonnet-4-20250514", priority=0),
    LLMProvider("openai", openai.OpenAI(), "gpt-4o", priority=1),
])

Pattern 3: Response Caching

Many LLM calls are repetitive. Cache them:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import hashlib
import json
import redis
from typing import Optional

class CachedLLM:
    """Wrap an Anthropic client with a Redis-backed response cache."""

    def __init__(self, client, redis_url: str = "redis://localhost:6379"):
        self.client = client
        self.redis = redis.from_url(redis_url)
        self.default_ttl = 3600  # 1 hour

    def _cache_key(self, prompt: str, model: str, max_tokens: int) -> str:
        """Derive a deterministic Redis key from the request parameters."""
        fingerprint = hashlib.sha256(
            f"{model}:{max_tokens}:{prompt}".encode()
        ).hexdigest()
        # NOTE(review): key is truncated to 64 bits — collisions are unlikely
        # but possible at very large scale.
        return f"llm:{fingerprint[:16]}"

    def complete(
        self,
        prompt: str,
        model: str = "claude-sonnet-4-20250514",
        max_tokens: int = 1024,
        use_cache: bool = True,
        ttl: Optional[int] = None
    ) -> str:
        """Return a completion, serving from Redis when a cached copy exists."""
        key = self._cache_key(prompt, model, max_tokens)

        # Serve from cache when possible.
        if use_cache:
            hit = self.redis.get(key)
            if hit:
                return hit.decode('utf-8')

        # Cache miss (or caching disabled): call the API.
        api_response = self.client.messages.create(
            model=model,
            max_tokens=max_tokens,
            messages=[{"role": "user", "content": prompt}]
        )
        text = api_response.content[0].text

        # Store for later, honoring a per-call TTL override.
        if use_cache:
            self.redis.setex(key, ttl or self.default_ttl, text)

        return text

For semantic caching (matching similar but not identical prompts), consider using embeddings:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
# Semantic cache lookup
def find_similar_cached(self, prompt: str, threshold: float = 0.95) -> Optional[str]:
    """Return a cached response for a semantically similar prompt, if any.

    Embeds the prompt, finds its nearest neighbour among previously cached
    prompts, and counts it as a hit only when similarity exceeds ``threshold``.
    """
    embedding = self.get_embedding(prompt)

    # Nearest neighbour in the vector store of cached prompts.
    matches = self.vector_store.search(embedding, limit=1)
    if not matches:
        return None

    best = matches[0]
    if best.score <= threshold:
        return None
    return self.redis.get(f"llm:{best.id}")

Pattern 4: Circuit Breaker

When an API is consistently failing, stop hammering it:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import time
from enum import Enum

class CircuitState(Enum):
    """Lifecycle states for a circuit breaker."""
    CLOSED = "closed"      # Normal operation
    OPEN = "open"          # Failing, reject calls
    HALF_OPEN = "half_open"  # Testing if recovered

class CircuitBreaker:
    """Stop calling a failing dependency until it has had time to recover.

    CLOSED: calls pass through. After ``failure_threshold`` failures the
    breaker OPENs and rejects calls. Once ``recovery_timeout`` seconds have
    passed, one probe call is allowed (HALF_OPEN); success closes the
    breaker, failure re-opens it immediately.
    """

    def __init__(
        self,
        failure_threshold: int = 5,
        recovery_timeout: int = 60
    ):
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.failures = 0
        self.last_failure_time = 0
        self.state = CircuitState.CLOSED

    def call(self, func, *args, **kwargs):
        """Invoke ``func`` through the breaker.

        Raises:
            Exception: "Circuit breaker is OPEN" while calls are rejected;
            otherwise whatever ``func`` raises.
        """
        if self.state == CircuitState.OPEN:
            if time.time() - self.last_failure_time > self.recovery_timeout:
                # Recovery window elapsed: allow one probe call through.
                self.state = CircuitState.HALF_OPEN
            else:
                raise Exception("Circuit breaker is OPEN")

        try:
            result = func(*args, **kwargs)
        except Exception:
            self._on_failure()
            raise
        self._on_success()
        return result

    def _on_success(self):
        # Any success fully resets the breaker.
        self.failures = 0
        self.state = CircuitState.CLOSED

    def _on_failure(self):
        self.failures += 1
        self.last_failure_time = time.time()
        # A failed HALF_OPEN probe re-opens immediately, regardless of the
        # counter; previously this relied on the failure count happening
        # never to be reset when entering HALF_OPEN.
        if self.state == CircuitState.HALF_OPEN or self.failures >= self.failure_threshold:
            self.state = CircuitState.OPEN

Pattern 5: Request Queuing and Batching

For high-throughput scenarios, queue requests and process them within rate limits:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import asyncio
from asyncio import Queue

class RateLimitedQueue:
    """Serialize LLM calls through a queue at a fixed requests-per-minute rate.

    Callers ``await add(prompt)`` to enqueue work and get back a Future that
    resolves with the response; a single ``process()`` task drains the queue,
    spacing calls ``interval`` seconds apart.
    """

    def __init__(self, requests_per_minute: int = 50):
        self.queue = Queue()
        self.interval = 60 / requests_per_minute  # seconds between calls
        self.running = False

    async def add(self, prompt: str) -> asyncio.Future:
        """Enqueue a prompt; the returned Future resolves with its result."""
        # Create the future via the running loop — bare asyncio.Future()
        # without a loop argument is fragile and deprecated in this usage.
        future = asyncio.get_running_loop().create_future()
        await self.queue.put((prompt, future))
        return future

    async def process(self):
        """Drain the queue until stop() is called, one call per interval."""
        self.running = True
        while self.running:
            prompt, future = await self.queue.get()
            try:
                # NOTE(review): _call_llm is assumed to be provided elsewhere.
                result = await self._call_llm(prompt)
                future.set_result(result)
            except Exception as e:
                future.set_exception(e)
            finally:
                # Keep queue.join() accurate for callers that use it.
                self.queue.task_done()
            await asyncio.sleep(self.interval)

    def stop(self):
        """Let process() exit after the item it is currently handling."""
        self.running = False

Putting It All Together

In production, combine these patterns:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
class ProductionLLM:
    """Combine caching, retries, circuit breaking and provider fallbacks."""

    def __init__(self):
        self.cache = CachedLLM(anthropic.Anthropic())
        self.circuit_breaker = CircuitBreaker()
        self.fallback_providers = [...]  # configure LLMProvider fallbacks here

    @retry_with_backoff(max_retries=3)
    def complete(self, prompt: str, **kwargs) -> str:
        """Complete a prompt with every resilience layer applied."""
        # CachedLLM.complete() consults the cache itself, so no separate
        # pre-check is needed — the old `self.cache.get(prompt)` called a
        # method CachedLLM does not define and raised AttributeError.
        try:
            # Primary provider, guarded by the circuit breaker.
            return self.circuit_breaker.call(
                self.cache.complete, prompt, **kwargs
            )
        except Exception:
            # Primary unavailable (or breaker open): try the fallbacks.
            # NOTE(review): _try_fallbacks is assumed to be defined elsewhere.
            return self._try_fallbacks(prompt, **kwargs)

Final Thoughts

Building resilient LLM integrations isn’t just about catching errors—it’s about designing for failure from the start. Rate limits, timeouts, and outages are normal. Your code should treat them that way.

The patterns here add complexity, but they pay off when your system stays up while others are scrambling. Start with retries and caching, add circuit breakers when you need them, and always have a fallback plan.

Your users (and your on-call rotation) will thank you.