LLM APIs are deceptively simple: send a prompt, get text back. But building reliable AI features requires handling rate limits, managing costs, structuring outputs, and gracefully degrading when things go wrong.
Here are the patterns that work in production.
The Basic Client#
Start with a wrapper that handles common concerns:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import os
import time
from typing import Optional

import anthropic
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)
class LLMClient:
    """Thin wrapper around the Anthropic SDK with defaults and retry logic.

    The client reads ANTHROPIC_API_KEY from the environment (SDK default).
    """

    def __init__(self):
        self.client = anthropic.Anthropic()
        self.default_model = "claude-sonnet-4-20250514"
        self.max_tokens = 4096

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=4, max=60),
        # Only retry errors that can plausibly succeed on a later attempt.
        # Retrying auth or validation failures just burns the attempt budget
        # and delays the real error.
        retry=retry_if_exception_type(
            (
                anthropic.RateLimitError,
                anthropic.APIConnectionError,
                anthropic.InternalServerError,
            )
        ),
    )
    def complete(
        self,
        prompt: str,
        system: Optional[str] = None,
        model: Optional[str] = None,
        max_tokens: Optional[int] = None
    ) -> str:
        """Send a single-turn prompt and return the first text block.

        Args:
            prompt: The user message content.
            system: Optional system prompt; omitted entirely when None
                (previously an empty string was sent, which is not the same
                as "no system prompt").
            model: Overrides the default model when given.
            max_tokens: Overrides the default token cap when given.

        Returns:
            The text of the first content block in the response.
        """
        request: dict = {
            "model": model or self.default_model,
            "max_tokens": max_tokens or self.max_tokens,
            "messages": [{"role": "user", "content": prompt}],
        }
        if system is not None:
            request["system"] = system
        response = self.client.messages.create(**request)
        return response.content[0].text
|
The tenacity library handles retries with exponential backoff — essential for rate limits and transient errors.
Structured Outputs#
Raw text is hard to work with. Force structured responses:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
| import json
from pydantic import BaseModel, ValidationError
class ProductReview(BaseModel):
    """Validated structure for an LLM-produced product-review analysis."""

    sentiment: str  # positive, negative, neutral
    score: float  # 0.0 to 1.0
    summary: str  # one-sentence summary of the review
    key_points: list[str]  # bullet-style highlights extracted from the review
def analyze_review(review_text: str) -> ProductReview:
    """Analyze a product review with the LLM and return a validated result.

    Args:
        review_text: Raw review text to analyze.

    Returns:
        A ProductReview parsed and validated from the model's JSON reply.

    Raises:
        ValueError: if the reply is not valid JSON matching the schema;
            the underlying parse/validation error is chained as __cause__.
    """
    prompt = f"""Analyze this product review and respond with JSON only:
Review: {review_text}
Respond with this exact JSON structure:
{{
"sentiment": "positive|negative|neutral",
"score": 0.0-1.0,
"summary": "one sentence summary",
"key_points": ["point 1", "point 2"]
}}"""
    response = llm.complete(
        prompt=prompt,
        system="You are a JSON-only response bot. Output valid JSON, nothing else."
    )

    # Models sometimes wrap JSON in markdown fences despite instructions;
    # strip them before parsing instead of failing outright.
    cleaned = response.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`")
        if cleaned.startswith("json"):
            cleaned = cleaned[4:]

    # Parse and validate
    try:
        data = json.loads(cleaned)
        return ProductReview(**data)
    except (json.JSONDecodeError, ValidationError) as e:
        # Retry with stricter prompt or return default; chain the cause so
        # callers can see exactly what failed to parse.
        raise ValueError(f"Failed to parse LLM response: {e}") from e
|
For even more reliability, use Claude’s tool use feature to guarantee structured output:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
def analyze_review_with_tools(review_text: str) -> ProductReview:
    """Analyze a review via tool use, which guarantees schema-shaped output.

    Raises:
        ValueError: if the response contains no tool_use block (previously
            this fell off the end of the function and returned None).
    """
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=1024,
        tools=[{
            "name": "submit_analysis",
            "description": "Submit the review analysis",
            "input_schema": ProductReview.model_json_schema()
        }],
        # Force the model to call the tool rather than answering in prose.
        tool_choice={"type": "tool", "name": "submit_analysis"},
        messages=[{
            "role": "user",
            "content": f"Analyze this review: {review_text}"
        }]
    )
    # Extract tool call
    for block in response.content:
        if block.type == "tool_use":
            return ProductReview(**block.input)
    raise ValueError("Model response contained no tool_use block")
|
Cost Control#
LLM calls add up fast. Implement guards:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
| from functools import lru_cache
import hashlib
class BudgetExceededError(RuntimeError):
    """Raised when a call would exceed the configured daily LLM budget."""


class CostAwareLLM:
    """LLM wrapper that tracks estimated spend against a daily budget.

    NOTE(review): daily_spend is never reset automatically — a scheduled job
    (or date-aware tracking) must zero it each day; confirm how callers
    handle the rollover.
    """

    def __init__(self, daily_budget_usd: float = 10.0):
        self.daily_budget = daily_budget_usd
        self.daily_spend = 0.0
        # Pricing per 1K tokens; adjust per model.
        self.cost_per_1k_input = 0.003
        self.cost_per_1k_output = 0.015
        # Per-instance response cache. A functools.lru_cache on the bound
        # method would key on `self` and keep every instance alive for the
        # cache's lifetime (ruff B019) — hence the explicit dict.
        self._response_cache: dict = {}

    def estimate_cost(self, input_tokens: int, output_tokens: int) -> float:
        """Return the estimated USD cost of one call from its token counts."""
        return (
            (input_tokens / 1000) * self.cost_per_1k_input +
            (output_tokens / 1000) * self.cost_per_1k_output
        )

    def complete(self, prompt: str, **kwargs) -> str:
        """Call the API, tracking spend.

        Raises:
            BudgetExceededError: once accumulated spend reaches the budget.
        """
        # Check budget before spending anything.
        if self.daily_spend >= self.daily_budget:
            raise BudgetExceededError("Daily LLM budget exceeded")

        response = self._call_api(prompt, **kwargs)

        # Track spend from the usage the API reports back.
        cost = self.estimate_cost(
            response.usage.input_tokens,
            response.usage.output_tokens
        )
        self.daily_spend += cost
        return response.content[0].text

    def complete_cached(self, prompt: str, **kwargs) -> str:
        """Cache identical prompts to avoid redundant calls.

        NOTE: the cache is unbounded; bound or expire it if the prompt
        space is open-ended.
        """
        key = (prompt, tuple(sorted(kwargs.items())))
        if key not in self._response_cache:
            self._response_cache[key] = self.complete(prompt, **kwargs)
        return self._response_cache[key]
|
Graceful Degradation#
When the LLM fails, have a fallback:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
def categorize_ticket(ticket_text: str) -> str:
    """Categorize a support ticket via the LLM, with a keyword fallback.

    Returns one of "billing", "technical", or "general". Any LLM failure —
    or an off-list answer — drops through to keyword matching.
    """
    valid_categories = ("billing", "technical", "general")
    try:
        reply = llm.complete(
            prompt=f"Categorize this support ticket into one of: billing, technical, general\n\nTicket: {ticket_text}",
            max_tokens=50
        )
        candidate = reply.strip().lower()
        if candidate in valid_categories:
            return candidate
    except Exception as exc:
        logger.warning(f"LLM categorization failed: {exc}")

    # Keyword fallback: first matching bucket wins, in this order.
    lowered = ticket_text.lower()
    keyword_buckets = {
        "billing": ["invoice", "charge", "payment", "refund"],
        "technical": ["error", "bug", "crash", "not working"],
    }
    for label, keywords in keyword_buckets.items():
        if any(word in lowered for word in keywords):
            return label
    return "general"
|
Streaming for UX#
For user-facing features, stream responses:
1
2
3
4
5
6
7
8
9
10
async def stream_response(prompt: str):
    """Stream LLM response text chunks for better perceived latency.

    NOTE(review): this iterates the SDK's synchronous stream from inside an
    async generator, which blocks the event loop while waiting on the
    network; consider the async client with `async with`/`async for` —
    confirm which client type `client` is.
    """
    request_messages = [{"role": "user", "content": prompt}]
    with client.messages.stream(
        model="claude-sonnet-4-20250514",
        max_tokens=1024,
        messages=request_messages
    ) as stream:
        for chunk in stream.text_stream:
            yield chunk
|
In a web framework:
1
2
3
4
5
6
7
8
9
10
11
| from fastapi import FastAPI
from fastapi.responses import StreamingResponse
app = FastAPI()


@app.get("/chat")
async def chat(prompt: str):
    """Stream a completion back to the caller as server-sent events."""
    token_stream = stream_response(prompt)
    return StreamingResponse(token_stream, media_type="text/event-stream")
|
Prompt Management#
Don’t hardcode prompts. Version and manage them:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
| # prompts/summarize.txt
"""
Summarize the following text in ${style} style.
Keep it under ${max_words} words.
Focus on: ${focus_areas}
Text: ${text}
"""
# prompt_manager.py
from pathlib import Path
from string import Template
class PromptManager:
def __init__(self, prompts_dir: str = "prompts"):
self.prompts_dir = Path(prompts_dir)
self._cache = {}
def get(self, name: str, **variables) -> str:
if name not in self._cache:
path = self.prompts_dir / f"{name}.txt"
self._cache[name] = path.read_text()
template = Template(self._cache[name])
return template.safe_substitute(**variables)
# Usage: load the "summarize" template and fill in its variables.
prompts = PromptManager()
prompt = prompts.get(
    "summarize",
    style="professional",
    max_words=100,
    focus_areas="key decisions and action items",
    text=meeting_transcript  # assumes meeting_transcript is defined by the caller context
)
|
Testing LLM Features#
LLMs are non-deterministic. Test the structure, not exact outputs:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
| import pytest
def test_review_analysis_structure():
    """The analysis result must satisfy the ProductReview contract."""
    analysis = analyze_review("Great product, fast shipping!")

    assert analysis.sentiment in ("positive", "negative", "neutral")
    assert 0.0 <= analysis.score <= 1.0
    assert analysis.summary != ""
    assert isinstance(analysis.key_points, list)
def test_review_analysis_sentiment_direction():
    """The score should point the same way as the review's obvious tone."""
    glowing = analyze_review("Amazing! Best purchase ever! Highly recommend!")
    scathing = analyze_review("Terrible. Broke immediately. Want refund.")

    assert glowing.score > 0.5
    assert scathing.score < 0.5
|
For deterministic tests, mock the LLM:
1
2
3
4
5
6
| from unittest.mock import patch
def test_categorization_with_mock():
    """Pin the deterministic path by stubbing out the LLM call."""
    with patch.object(llm, "complete", return_value="billing"):
        assert categorize_ticket("I need a refund") == "billing"
|
Key Takeaways#
- Always retry — Rate limits and transient errors are normal
- Structure outputs — Use JSON schemas or tool use for reliable parsing
- Control costs — Cache, budget, and monitor usage
- Degrade gracefully — Have fallbacks when LLM fails
- Stream for UX — Users prefer seeing progress
- Version prompts — Treat them like code
- Test structure — Don’t assert on exact LLM outputs
LLM APIs are powerful but unpredictable. These patterns help you build features that work reliably in production, not just in demos.