LLM APIs are deceptively simple: send a prompt, get text back. But building reliable AI features requires handling rate limits, managing costs, structuring outputs, and gracefully degrading when things go wrong.

Here are the patterns that work in production.

The Basic Client

Start with a wrapper that handles common concerns:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import os
import time
from typing import Optional

import anthropic
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

class LLMClient:
    """Thin wrapper around the Anthropic SDK with defaults and retry handling."""

    def __init__(self):
        self.client = anthropic.Anthropic()
        self.default_model = "claude-sonnet-4-20250514"
        self.max_tokens = 4096

    @retry(
        # Only retry errors that can plausibly succeed on a second attempt:
        # rate limits, connection failures, and 5xx server errors. The
        # original retried *every* exception, so auth and request-validation
        # errors burned all three attempts before surfacing.
        retry=retry_if_exception_type(
            (
                anthropic.RateLimitError,
                anthropic.APIConnectionError,
                anthropic.InternalServerError,
            )
        ),
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=4, max=60),
    )
    def complete(
        self,
        prompt: str,
        system: Optional[str] = None,
        model: Optional[str] = None,
        max_tokens: Optional[int] = None,
    ) -> str:
        """Send a single-turn prompt and return the model's text reply.

        Args:
            prompt: The user message.
            system: Optional system prompt; omitted entirely when None.
            model: Override for the default model.
            max_tokens: Override for the default completion budget.

        Raises:
            ValueError: If the response contains no text block.
        """
        kwargs = {
            "model": model or self.default_model,
            "max_tokens": max_tokens or self.max_tokens,
            "messages": [{"role": "user", "content": prompt}],
        }
        # Only send `system` when the caller provided one; the original sent
        # system="" for None, and an empty system prompt is not the same as
        # no system prompt.
        if system is not None:
            kwargs["system"] = system

        response = self.client.messages.create(**kwargs)

        # The first content block is not guaranteed to be text (e.g. with
        # tool use); find the first text block explicitly instead of
        # indexing content[0] blindly.
        for block in response.content:
            if getattr(block, "type", None) == "text":
                return block.text
        raise ValueError("No text block in model response")

The tenacity library handles retries with exponential backoff — essential for rate limits and transient errors.

Structured Outputs

Raw text is hard to work with. Force structured responses:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import json
from pydantic import BaseModel, ValidationError

class ProductReview(BaseModel):
    """Structured result of analyzing one product review.

    NOTE(review): the comments below describe intended constraints; they are
    not enforced by validators here — out-of-range values pass validation.
    """

    sentiment: str  # expected: "positive", "negative", or "neutral"
    score: float    # expected range 0.0 to 1.0
    summary: str    # one-sentence summary of the review
    key_points: list[str]  # short extracted bullet points

def analyze_review(review_text: str) -> ProductReview:
    """Classify a product review into a validated ProductReview.

    Args:
        review_text: Raw review text to analyze.

    Raises:
        ValueError: If the model response cannot be parsed into the schema.
    """
    prompt = f"""Analyze this product review and respond with JSON only:

Review: {review_text}

Respond with this exact JSON structure:
{{
    "sentiment": "positive|negative|neutral",
    "score": 0.0-1.0,
    "summary": "one sentence summary",
    "key_points": ["point 1", "point 2"]
}}"""

    response = llm.complete(
        prompt=prompt,
        system="You are a JSON-only response bot. Output valid JSON, nothing else."
    )

    # Models often wrap JSON in markdown fences (```json ... ```) or add a
    # preamble even when told not to; slice out the outermost JSON object
    # before parsing instead of failing on the decoration.
    start = response.find("{")
    end = response.rfind("}")
    candidate = response[start:end + 1] if start != -1 and end > start else response

    # Parse and validate
    try:
        data = json.loads(candidate)
        return ProductReview(**data)
    except (json.JSONDecodeError, ValidationError) as e:
        # Chain the cause so the original parse error is preserved.
        raise ValueError(f"Failed to parse LLM response: {e}") from e

For even more reliability, use Claude’s tool use feature to guarantee structured output:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
def analyze_review_with_tools(review_text: str) -> ProductReview:
    """Analyze a review via tool use so the output must match the schema.

    Raises:
        ValueError: If the model response contains no tool call.
    """
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=1024,
        tools=[{
            "name": "submit_analysis",
            "description": "Submit the review analysis",
            "input_schema": ProductReview.model_json_schema()
        }],
        # Force the model to call the tool. Without this it may answer in
        # plain text and the extraction loop below would find nothing.
        tool_choice={"type": "tool", "name": "submit_analysis"},
        messages=[{
            "role": "user",
            "content": f"Analyze this review: {review_text}"
        }]
    )

    # Extract tool call
    for block in response.content:
        if block.type == "tool_use":
            return ProductReview(**block.input)

    # The original fell off the end and returned None despite the declared
    # return type; fail loudly instead so callers don't get a silent None.
    raise ValueError("Model response contained no tool_use block")

Cost Control

LLM calls add up fast. Implement guards:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from functools import lru_cache
import hashlib

class BudgetExceededError(RuntimeError):
    """Raised when the daily LLM spend limit has been reached."""
    # NOTE(review): this exception was referenced but not defined in the
    # original snippet — remove this definition if it exists elsewhere.


class CostAwareLLM:
    """LLM wrapper that tracks spend against a daily budget and caches calls."""

    def __init__(self, daily_budget_usd: float = 10.0):
        self.daily_budget = daily_budget_usd
        self.daily_spend = 0.0
        self.cost_per_1k_input = 0.003   # Adjust per model
        self.cost_per_1k_output = 0.015
        self._spend_date = None          # calendar day daily_spend applies to
        self._cache = {}                 # prompt-hash -> response text
        self._cache_max = 1000           # matches the old lru_cache maxsize

    def estimate_cost(self, input_tokens: int, output_tokens: int) -> float:
        """Return the estimated USD cost of one call at the configured rates."""
        return (
            (input_tokens / 1000) * self.cost_per_1k_input +
            (output_tokens / 1000) * self.cost_per_1k_output
        )

    def _roll_day(self) -> None:
        """Reset the spend counter when the calendar day changes.

        The original never reset daily_spend, so the "daily" budget was
        effectively a lifetime budget: once hit, every future call failed.
        """
        from datetime import date
        today = date.today()
        if self._spend_date != today:
            self._spend_date = today
            self.daily_spend = 0.0

    def complete(self, prompt: str, **kwargs) -> str:
        """Call the API, enforcing the daily budget and recording spend.

        Raises:
            BudgetExceededError: If today's spend has reached the budget.
        """
        self._roll_day()
        if self.daily_spend >= self.daily_budget:
            raise BudgetExceededError("Daily LLM budget exceeded")

        response = self._call_api(prompt, **kwargs)

        # Track spend from the actual token usage reported by the API.
        cost = self.estimate_cost(
            response.usage.input_tokens,
            response.usage.output_tokens
        )
        self.daily_spend += cost

        return response.content[0].text

    def complete_cached(self, prompt: str, **kwargs) -> str:
        """Cache identical prompts to avoid redundant calls.

        Uses a per-instance dict instead of @lru_cache: lru_cache on an
        instance method keys on `self` and keeps every instance alive for
        the cache's lifetime (ruff B019).
        """
        key = hashlib.sha256(
            repr((prompt, sorted(kwargs.items()))).encode()
        ).hexdigest()
        if key not in self._cache:
            if len(self._cache) >= self._cache_max:
                # Evict the oldest entry; dicts preserve insertion order.
                self._cache.pop(next(iter(self._cache)))
            self._cache[key] = self.complete(prompt, **kwargs)
        return self._cache[key]

Graceful Degradation

When the LLM fails, have a fallback:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
def categorize_ticket(ticket_text: str) -> str:
    """Categorize support ticket with LLM, fall back to keywords.

    Returns one of "billing", "technical", or "general".
    """
    valid = ("billing", "technical", "general")

    try:
        result = llm.complete(
            prompt=f"Categorize this support ticket into one of: billing, technical, general\n\nTicket: {ticket_text}",
            max_tokens=50
        )
        # Models routinely decorate the answer ('Billing.', '"billing"');
        # the original's exact-match check rejected those and needlessly
        # fell through to keywords. Strip common decoration first.
        category = result.strip().lower().strip(".!\"'` ")
        if category in valid:
            return category
    except Exception as e:
        # Broad catch is deliberate: any LLM failure degrades to keywords.
        logger.warning(f"LLM categorization failed: {e}")

    # Keyword fallback
    text_lower = ticket_text.lower()
    if any(w in text_lower for w in ["invoice", "charge", "payment", "refund"]):
        return "billing"
    elif any(w in text_lower for w in ["error", "bug", "crash", "not working"]):
        return "technical"

    return "general"

Streaming for UX

For user-facing features, stream responses:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
async def stream_response(prompt: str):
    """Stream LLM response for better perceived latency.

    Yields text deltas as they arrive from the API.
    """
    # The original used the synchronous client inside an async generator,
    # which blocks the event loop for the entire stream. Use the async
    # client so other requests keep being served while tokens arrive.
    async_client = anthropic.AsyncAnthropic()
    async with async_client.messages.stream(
        model="claude-sonnet-4-20250514",
        max_tokens=1024,
        messages=[{"role": "user", "content": prompt}]
    ) as stream:
        async for text in stream.text_stream:
            yield text

In a web framework:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
from fastapi import FastAPI
from fastapi.responses import StreamingResponse

app = FastAPI()

@app.get("/chat")
async def chat(prompt: str):
    """Stream a completion back to the client as Server-Sent Events."""

    async def sse_events():
        # The text/event-stream media type requires SSE framing
        # ("data: ...\n\n"). The original sent raw text chunks under that
        # content type, which breaks EventSource clients even though it
        # looks fine in curl.
        async for chunk in stream_response(prompt):
            yield f"data: {chunk}\n\n"

    return StreamingResponse(
        sse_events(),
        media_type="text/event-stream"
    )

Prompt Management

Don’t hardcode prompts. Version and manage them:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# prompts/summarize.txt
"""
Summarize the following text in {{style}} style.
Keep it under {{max_words}} words.
Focus on: {{focus_areas}}

Text: {{text}}
"""

# prompt_manager.py
from pathlib import Path
from string import Template

class PromptManager:
    """Load prompt templates from disk and fill in {{variable}} placeholders."""

    def __init__(self, prompts_dir: str = "prompts"):
        self.prompts_dir = Path(prompts_dir)
        self._cache = {}  # template name -> raw file contents

    def get(self, name: str, **variables) -> str:
        """Return the named template with each {{key}} replaced by its value.

        Unmatched placeholders are left in place, so a missing variable is
        visible in the output rather than silently dropped.
        """
        if name not in self._cache:
            path = self.prompts_dir / f"{name}.txt"
            self._cache[name] = path.read_text()

        # Bug fix: the original used string.Template.safe_substitute, which
        # matches $name / ${name} — it never matched the {{name}} syntax the
        # prompt files actually use, so every template came back unfilled.
        text = self._cache[name]
        for key, value in variables.items():
            text = text.replace("{{" + key + "}}", str(value))
        return text

# Usage: render the prompts/summarize.txt template with concrete values.
prompts = PromptManager()
prompt = prompts.get(
    "summarize",
    style="professional",
    max_words=100,
    focus_areas="key decisions and action items",
    text=meeting_transcript  # assumes meeting_transcript is defined by the caller — TODO confirm
)

Testing LLM Features

LLMs are non-deterministic. Test the structure, not exact outputs:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
import pytest

def test_review_analysis_structure():
    """Any review must come back as a well-formed ProductReview."""
    analysis = analyze_review("Great product, fast shipping!")

    assert analysis.sentiment in {"positive", "negative", "neutral"}
    assert 0.0 <= analysis.score <= 1.0
    assert len(analysis.summary) > 0
    assert isinstance(analysis.key_points, list)

def test_review_analysis_sentiment_direction():
    """Clearly positive and clearly negative reviews score on opposite sides of 0.5."""
    glowing = analyze_review("Amazing! Best purchase ever! Highly recommend!")
    scathing = analyze_review("Terrible. Broke immediately. Want refund.")

    assert glowing.score > 0.5
    assert scathing.score < 0.5

For deterministic tests, mock the LLM:

1
2
3
4
5
6
from unittest.mock import patch

def test_categorization_with_mock():
    """With the LLM pinned to return 'billing', categorization must echo it."""
    with patch.object(llm, "complete", return_value="billing"):
        assert categorize_ticket("I need a refund") == "billing"

Key Takeaways

  1. Always retry — Rate limits and transient errors are normal
  2. Structure outputs — Use JSON schemas or tool use for reliable parsing
  3. Control costs — Cache, budget, and monitor usage
  4. Degrade gracefully — Have fallbacks when LLM fails
  5. Stream for UX — Users prefer seeing progress
  6. Version prompts — Treat them like code
  7. Test structure — Don’t assert on exact LLM outputs

LLM APIs are powerful but unpredictable. These patterns help you build features that work reliably in production, not just in demos.