When you’re building production systems that rely on LLM APIs, you quickly learn that “it works in development” doesn’t mean much. Rate limits hit at the worst times, APIs go down, and costs can spiral if you’re not careful. Here’s how to build integrations that actually survive the real world.
The Problem with Naive Integrations#
Most tutorials show you something like this:
import anthropic

client = anthropic.Anthropic()

# One-shot call: no retries, no timeout handling, no fallback.
response = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    messages=[{"role": "user", "content": prompt}],
)
This works great until:
- You hit rate limits during a traffic spike
- The API returns a 500 error
- Your monthly bill arrives
- A timeout leaves your user hanging
Pattern 1: Exponential Backoff with Jitter#
Rate limits are inevitable. The key is handling them gracefully:
import time
import random
from functools import wraps


def retry_with_backoff(max_retries=5, base_delay=1, max_delay=60):
    """Decorator: retry on rate limits and 5xx responses with exponential backoff.

    Args:
        max_retries: total number of attempts before giving up.
        base_delay: delay (seconds) before the first retry; doubles each attempt.
        max_delay: cap on the backoff delay, before jitter.

    Retries on anthropic.RateLimitError and on APIStatusError with status >= 500;
    any other error (and any 4xx) is raised immediately.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except anthropic.RateLimitError:
                    if attempt == max_retries:
                        raise
                    # Exponential backoff with jitter. Bug fix: the first retry
                    # now waits base_delay (2 ** (attempt - 1)), not 2x it.
                    delay = min(base_delay * (2 ** (attempt - 1)), max_delay)
                    jitter = random.uniform(0, delay * 0.1)
                    print(f"Rate limited. Retry {attempt}/{max_retries} in {delay:.1f}s")
                    time.sleep(delay + jitter)
                except anthropic.APIStatusError as e:
                    # 4xx errors are the caller's problem -- don't retry them.
                    if e.status_code < 500:
                        raise
                    if attempt == max_retries:
                        raise
                    # Linear backoff for server errors.
                    time.sleep(base_delay * attempt)
            # Bug fix: the original fell through to an extra *unprotected*
            # func() call here. Only reachable when max_retries <= 0, in which
            # case we make exactly one attempt with no retry handling.
            return func(*args, **kwargs)
        return wrapper
    return decorator
@retry_with_backoff(max_retries=5)
def call_llm(prompt: str) -> str:
    """Send a single-turn prompt to Claude and return the text of the reply."""
    message = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=1024,
        messages=[{"role": "user", "content": prompt}],
    )
    return message.content[0].text
The jitter prevents the “thundering herd” problem where all your retries hit simultaneously.
Pattern 2: Provider Fallbacks#
Don’t put all your eggs in one basket. When one provider is down or rate-limited, fall back to another:
| from dataclasses import dataclass
from typing import Optional, List
import anthropic
import openai
@dataclass
class LLMProvider:
    """Configuration for one LLM backend in the fallback chain."""
    name: str          # provider key the dispatcher switches on: "anthropic" or "openai"
    client: "Any"      # instantiated SDK client; typed loosely on purpose (was the builtin `any`)
    model: str         # model identifier passed through to the provider's API
    priority: int = 0  # lower value = tried first
class ResilientLLM:
    """Tries providers in priority order, falling back on any failure."""

    def __init__(self, providers: "List[LLMProvider]"):
        # Lower priority value = tried first.
        self.providers = sorted(providers, key=lambda p: p.priority)

    def complete(self, prompt: str, max_tokens: int = 1024) -> str:
        """Return the first successful completion.

        Raises:
            Exception: if every configured provider fails; the message lists
                each provider's error.
        """
        errors = []
        for provider in self.providers:
            try:
                return self._call_provider(provider, prompt, max_tokens)
            except Exception as e:
                errors.append(f"{provider.name}: {e}")
        raise Exception(f"All providers failed: {errors}")

    def _call_provider(self, provider: "LLMProvider", prompt: str, max_tokens: int) -> str:
        """Dispatch one request to a provider's SDK and normalize the reply text."""
        if provider.name == "anthropic":
            response = provider.client.messages.create(
                model=provider.model,
                max_tokens=max_tokens,
                messages=[{"role": "user", "content": prompt}]
            )
            return response.content[0].text
        elif provider.name == "openai":
            response = provider.client.chat.completions.create(
                model=provider.model,
                max_tokens=max_tokens,
                messages=[{"role": "user", "content": prompt}]
            )
            return response.choices[0].message.content
        # Bug fix: the original fell through and silently returned None for an
        # unrecognized provider name; fail loudly instead (caught by complete()).
        raise ValueError(f"Unknown provider: {provider.name}")
# Usage: Claude is the primary (priority 0); GPT-4o is the fallback.
primary = LLMProvider("anthropic", anthropic.Anthropic(), "claude-sonnet-4-20250514", priority=0)
backup = LLMProvider("openai", openai.OpenAI(), "gpt-4o", priority=1)
llm = ResilientLLM([primary, backup])
Pattern 3: Response Caching#
Many LLM calls are repetitive. Cache them:
| import hashlib
import json
import redis
from typing import Optional
class CachedLLM:
    """Wraps an Anthropic client with a Redis-backed exact-match response cache."""

    def __init__(self, client, redis_url: str = "redis://localhost:6379"):
        self.client = client
        self.redis = redis.from_url(redis_url)
        self.default_ttl = 3600  # seconds (1 hour)

    def _cache_key(self, prompt: str, model: str, max_tokens: int) -> str:
        """Derive a short, deterministic Redis key from the full request."""
        digest = hashlib.sha256(f"{model}:{max_tokens}:{prompt}".encode()).hexdigest()
        return f"llm:{digest[:16]}"

    def complete(
        self,
        prompt: str,
        model: str = "claude-sonnet-4-20250514",
        max_tokens: int = 1024,
        use_cache: bool = True,
        ttl: Optional[int] = None
    ) -> str:
        """Return a completion, serving from Redis when a cached copy exists.

        Set use_cache=False to bypass both the lookup and the write-back;
        ttl overrides the default one-hour expiry for this entry only.
        """
        key = self._cache_key(prompt, model, max_tokens)

        # Serve a cache hit without touching the API at all.
        if use_cache:
            hit = self.redis.get(key)
            if hit:
                return hit.decode('utf-8')

        # Cache miss (or bypass): make the real call.
        response = self.client.messages.create(
            model=model,
            max_tokens=max_tokens,
            messages=[{"role": "user", "content": prompt}]
        )
        text = response.content[0].text

        # Write back with an expiry so stale answers age out.
        if use_cache:
            self.redis.setex(key, ttl or self.default_ttl, text)
        return text
For semantic caching (matching similar but not identical prompts), consider using embeddings:
# Semantic cache lookup
def find_similar_cached(self, prompt: str, threshold: float = 0.95) -> Optional[str]:
    """Return a cached response for a semantically similar prompt, if any.

    NOTE(review): assumes the surrounding class provides self.get_embedding
    and self.vector_store -- neither is shown here.
    """
    prompt_embedding = self.get_embedding(prompt)
    # Search your vector store for similar prompts
    results = self.vector_store.search(prompt_embedding, limit=1)
    if results and results[0].score > threshold:
        cached = self.redis.get(f"llm:{results[0].id}")
        # Bug fix: redis returns bytes (or None if the entry expired); decode
        # so the return matches the declared Optional[str].
        return cached.decode("utf-8") if cached else None
    return None
Pattern 4: Circuit Breaker#
When an API is consistently failing, stop hammering it:
import time
from enum import Enum


class CircuitState(Enum):
    """Lifecycle states for the breaker."""
    CLOSED = "closed"        # Normal operation
    OPEN = "open"            # Failing, reject calls
    HALF_OPEN = "half_open"  # Testing if recovered


class CircuitBreaker:
    """Stops calling a failing dependency until a recovery window has elapsed."""

    def __init__(
        self,
        failure_threshold: int = 5,
        recovery_timeout: int = 60
    ):
        self.failure_threshold = failure_threshold  # consecutive failures before opening
        self.recovery_timeout = recovery_timeout    # seconds to wait before probing again
        self.failures = 0
        self.last_failure_time = 0
        self.state = CircuitState.CLOSED

    def call(self, func, *args, **kwargs):
        """Invoke func through the breaker; raises while the circuit is OPEN."""
        if self.state is CircuitState.OPEN:
            elapsed = time.time() - self.last_failure_time
            if elapsed <= self.recovery_timeout:
                raise Exception("Circuit breaker is OPEN")
            # Window elapsed: let one probe call through.
            self.state = CircuitState.HALF_OPEN

        try:
            result = func(*args, **kwargs)
        except Exception:
            self._on_failure()
            raise
        self._on_success()
        return result

    def _on_success(self):
        # Any success fully closes the circuit and clears the failure count.
        self.failures = 0
        self.state = CircuitState.CLOSED

    def _on_failure(self):
        # Note: failures only resets on success, so a failed HALF_OPEN probe
        # immediately re-opens the circuit (count is still at the threshold).
        self.failures += 1
        self.last_failure_time = time.time()
        if self.failures >= self.failure_threshold:
            self.state = CircuitState.OPEN
Pattern 5: Request Queuing and Batching#
For high-throughput scenarios, queue requests and process them within rate limits:
import asyncio
from asyncio import Queue


class RateLimitedQueue:
    """Serializes LLM calls through a queue at a fixed requests-per-minute rate."""

    def __init__(self, requests_per_minute: int = 50):
        self.queue = Queue()
        # Minimum spacing between consecutive requests, in seconds.
        self.interval = 60 / requests_per_minute
        self.running = False

    async def add(self, prompt: str) -> asyncio.Future:
        """Enqueue a prompt; the caller awaits the returned future for the result."""
        future = asyncio.Future()
        await self.queue.put((prompt, future))
        return future

    async def process(self):
        """Worker loop: drain the queue one request per interval until stopped."""
        self.running = True
        while self.running:
            prompt, future = await self.queue.get()
            try:
                # NOTE(review): _call_llm is not defined in this snippet -- it
                # must be supplied by a subclass or added to this class.
                result = await self._call_llm(prompt)
                future.set_result(result)
            except Exception as e:
                # Propagate the failure to the awaiting caller.
                future.set_exception(e)
            # Pace the next request to stay under the provider's rate limit.
            await asyncio.sleep(self.interval)
Putting It All Together#
In production, combine these patterns:
class ProductionLLM:
    """Composes caching, retries, a circuit breaker, and provider fallbacks."""

    def __init__(self):
        self.cache = CachedLLM(anthropic.Anthropic())
        self.circuit_breaker = CircuitBreaker()
        self.fallback_providers = [...]

    @retry_with_backoff(max_retries=3)
    def complete(self, prompt: str, **kwargs) -> str:
        """Complete a prompt through the full resilience stack.

        Bug fix: the earlier version called self.cache.get(prompt), but
        CachedLLM has no .get method (that would raise AttributeError) --
        and CachedLLM.complete already consults the cache before calling
        the API, so no separate pre-check is needed here.
        """
        # Try the primary provider, guarded by the circuit breaker.
        try:
            return self.circuit_breaker.call(
                self.cache.complete, prompt, **kwargs
            )
        except Exception:
            # Primary failed or the circuit is open: fall back to alternatives.
            return self._try_fallbacks(prompt, **kwargs)
Final Thoughts#
Building resilient LLM integrations isn’t just about catching errors—it’s about designing for failure from the start. Rate limits, timeouts, and outages are normal. Your code should treat them that way.
The patterns here add complexity, but they pay off when your system stays up while others are scrambling. Start with retries and caching, add circuit breakers when you need them, and always have a fallback plan.
Your users (and your on-call rotation) will thank you.