Retry Patterns: When and How to Try Again

Not all failures are permanent. Retry patterns help distinguish transient hiccups from real problems. Exponential Backoff 1 2 3 4 5 6 7 8 9 10 11 12 13 14 import time import random def retry_with_backoff(func, max_retries=5, base_delay=1): for attempt in range(max_retries): try: return func() except Exception as e: if attempt == max_retries - 1: raise delay = base_delay * (2 ** attempt) jitter = random.uniform(0, delay * 0.1) time.sleep(delay + jitter) Each retry waits longer: 1s, 2s, 4s, 8s (the fifth and final attempt re-raises instead of sleeping). Jitter prevents thundering herd. With tenacity 1 2 3 4 5 6 7 8 from tenacity import retry, stop_after_attempt, wait_exponential @retry( stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=1, max=60) ) def call_api(): return requests.get("https://api.example.com") Retry Only Transient Errors 1 2 3 4 5 6 7 8 from tenacity import retry, retry_if_exception_type @retry( retry=retry_if_exception_type((ConnectionError, TimeoutError)), stop=stop_after_attempt(3) ) def fetch_data(): return external_service.get() Don’t retry 400 Bad Request — that won’t fix itself. ...

February 28, 2026 · 2 min · 242 words · Rob Washington

Circuit Breakers: Fail Fast, Recover Gracefully

When a downstream service is failing, continuing to call it makes everything worse. Circuit breakers stop the cascade. The Pattern Three states: Closed: Normal operation, requests pass through Open: Service is failing, requests fail immediately Half-Open: Testing if service recovered [ C L β”Œ β”‚ β–Ό O β–² β”‚ β”” ─ S ─ ─ E ─ ─ D ─ ─ ] ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ f ─ ─ a ─ ─ i ─ ─ l s ─ u u ─ r c ─ e c ─ e ─ t s ─ h s ─ r ─ ─ e ─ ─ s ─ ─ h ─ ─ o ─ ─ l ─ ─ d ─ ─ ─ ─ ─ ─ ─ ─ β–Ά ─ ─ ─ ─ [ ─ ─ O ─ ─ P β”‚ β”‚ β”΄ ─ E ─ ─ N ─ ─ ] f ─ a ─ ─ i ─ ─ l ─ t u ─ i r ─ m e ─ e ─ ┐ β”‚ β”‚ o ─ u ─ t ─ ─ ─ ─ β”‚ β”‚ β”˜ β–Ά [ H A L F - O P E N ] Basic Implementation 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 import time from enum import Enum from threading import Lock class State(Enum): CLOSED = "closed" OPEN = "open" HALF_OPEN = "half_open" class CircuitBreaker: def __init__( self, failure_threshold: int = 5, recovery_timeout: int = 30, half_open_max_calls: int = 3 ): self.failure_threshold = failure_threshold self.recovery_timeout = recovery_timeout self.half_open_max_calls = half_open_max_calls self.state = State.CLOSED self.failure_count = 0 self.success_count = 0 self.last_failure_time = None self.lock = Lock() def can_execute(self) -> bool: with self.lock: if self.state == State.CLOSED: return True if self.state == State.OPEN: if time.time() - self.last_failure_time > self.recovery_timeout: self.state = State.HALF_OPEN self.success_count = 0 return True return False if self.state == State.HALF_OPEN: return self.success_count < self.half_open_max_calls return False def record_success(self): with self.lock: if self.state == State.HALF_OPEN: self.success_count += 1 if self.success_count >= self.half_open_max_calls: self.state = State.CLOSED self.failure_count = 0 else: self.failure_count = 0 def record_failure(self): with self.lock: self.failure_count 
+= 1 self.last_failure_time = time.time() if self.state == State.HALF_OPEN: self.state = State.OPEN elif self.failure_count >= self.failure_threshold: self.state = State.OPEN Using the Circuit Breaker 1 2 3 4 5 6 7 8 9 10 11 12 13 payment_breaker = CircuitBreaker(failure_threshold=3, recovery_timeout=60) def process_payment(order): if not payment_breaker.can_execute(): raise ServiceUnavailable("Payment service circuit open") try: result = payment_service.charge(order) payment_breaker.record_success() return result except Exception as e: payment_breaker.record_failure() raise Decorator Pattern 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 from functools import wraps def circuit_breaker(breaker: CircuitBreaker): def decorator(func): @wraps(func) def wrapper(*args, **kwargs): if not breaker.can_execute(): raise CircuitOpenError(f"Circuit breaker open for {func.__name__}") try: result = func(*args, **kwargs) breaker.record_success() return result except Exception as e: breaker.record_failure() raise return wrapper return decorator # Usage payment_cb = CircuitBreaker() @circuit_breaker(payment_cb) def charge_customer(customer_id, amount): return payment_api.charge(customer_id, amount) With Fallback 1 2 3 4 5 6 7 8 9 10 11 12 def get_user_recommendations(user_id): if not recommendations_breaker.can_execute(): # Fallback to cached or default recommendations return get_cached_recommendations(user_id) or DEFAULT_RECOMMENDATIONS try: result = recommendations_service.get(user_id) recommendations_breaker.record_success() return result except Exception: recommendations_breaker.record_failure() return get_cached_recommendations(user_id) or DEFAULT_RECOMMENDATIONS Library: pybreaker 1 2 3 4 5 6 7 8 9 10 11 12 13 import pybreaker db_breaker = pybreaker.CircuitBreaker( fail_max=5, reset_timeout=30 ) @db_breaker def query_database(sql): return db.execute(sql) # Check state print(db_breaker.current_state) # 'closed', 'open', or 'half-open' Library: tenacity (with 
circuit breaker) 1 2 3 4 5 6 7 8 from tenacity import retry, stop_after_attempt, CircuitBreaker cb = CircuitBreaker(failure_threshold=3, recovery_time=60) @retry(stop=stop_after_attempt(3)) @cb def call_external_api(): return requests.get("https://api.example.com/data") Per-Service Breakers 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 class ServiceRegistry: def __init__(self): self.breakers = {} def get_breaker(self, service_name: str) -> CircuitBreaker: if service_name not in self.breakers: self.breakers[service_name] = CircuitBreaker() return self.breakers[service_name] registry = ServiceRegistry() def call_service(service_name: str, endpoint: str): breaker = registry.get_breaker(service_name) if not breaker.can_execute(): raise ServiceUnavailable(f"{service_name} circuit is open") try: result = http_client.get(f"http://{service_name}/{endpoint}") breaker.record_success() return result except Exception: breaker.record_failure() raise Monitoring 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 from prometheus_client import Counter, Gauge circuit_state = Gauge( 'circuit_breaker_state', 'Circuit breaker state (0=closed, 1=open, 2=half-open)', ['service'] ) circuit_failures = Counter( 'circuit_breaker_failures_total', 'Circuit breaker failure count', ['service'] ) circuit_rejections = Counter( 'circuit_breaker_rejections_total', 'Requests rejected by open circuit', ['service'] ) # Update metrics in circuit breaker def record_failure(self, service_name): circuit_failures.labels(service=service_name).inc() # ... rest of failure logic circuit_state.labels(service=service_name).set(self.state.value) Configuration Guidelines Scenario Threshold Timeout Critical service, fast recovery 3-5 failures 15-30s Non-critical, can wait 5-10 failures 60-120s Flaky external API 3 failures 30-60s Database 5 failures 30s Anti-Patterns 1. Single global breaker ...

February 28, 2026 · 5 min · 977 words · Rob Washington

Circuit Breaker Patterns: Failing Fast Without Failing Hard

Your payment service is down. Every request to it times out after 30 seconds. You have 100 requests per second hitting that endpoint. Do the math: within 30 seconds, you’ve got 3,000 threads stuck waiting on a dead service at any given moment, and your entire application is choking. This is where circuit breakers earn their keep. The Problem: Cascading Failures In distributed systems, a single failing dependency can take down everything. Without protection, your system will: ...

February 21, 2026 · 8 min · 1535 words · Rob Washington

Circuit Breaker Pattern: Failing Fast to Stay Resilient

Learn how circuit breakers prevent cascade failures in distributed systems by detecting failures early and failing fast instead of waiting for timeouts.

February 15, 2026 · 7 min · 1459 words · Rob Washington

Retry Patterns: Exponential Backoff and Beyond

Networks fail. Services go down. Databases get overwhelmed. The question isn’t whether your requests will failβ€”it’s how gracefully you handle it when they do. Naive retry logic can turn a minor hiccup into a catastrophic cascade. Smart retry logic can make your system resilient to transient failures. The difference is in the details. The Naive Approach (Don’t Do This) 1 2 3 4 5 6 7 8 9 # Bad: Immediate retry loop def fetch_data(url): for attempt in range(5): try: response = requests.get(url, timeout=5) return response.json() except requests.RequestException: continue raise Exception("Failed after 5 attempts") This code has several problems: ...

February 12, 2026 · 8 min · 1546 words · Rob Washington

Chaos Engineering: Breaking Your Systems to Make Them Stronger

You don’t know if your system handles failures gracefully until failures happen. Chaos engineering lets you find out on your termsβ€”in controlled conditions, during business hours, with engineers ready to respond. The Principles Define steady state β€” What does β€œhealthy” look like? Hypothesize β€” β€œThe system will continue serving traffic if one pod dies” Inject failure β€” Kill the pod Observe β€” Did steady state hold? Learn β€” Fix what broke, update runbooks Start Simple: Kill Things Random Pod Termination 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 # chaos_pod_killer.py import random from kubernetes import client, config from datetime import datetime import logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class ChaosPodKiller: def __init__(self, namespace: str, label_selector: str): config.load_incluster_config() # or load_kube_config() for local self.v1 = client.CoreV1Api() self.namespace = namespace self.label_selector = label_selector def get_targets(self) -> list: """Get pods matching selector.""" pods = self.v1.list_namespaced_pod( namespace=self.namespace, label_selector=self.label_selector ) return [pod.metadata.name for pod in pods.items if pod.status.phase == "Running"] def kill_random_pod(self, dry_run: bool = True) -> str: """Kill a random pod from targets.""" targets = self.get_targets() if len(targets) <= 1: logger.warning("Only one pod running, skipping kill") return None victim = random.choice(targets) if dry_run: logger.info(f"DRY RUN: Would delete pod {victim}") else: logger.info(f"Deleting pod {victim}") self.v1.delete_namespaced_pod( name=victim, namespace=self.namespace, grace_period_seconds=0 ) return victim def run_experiment(self, duration_minutes: int = 30, interval_seconds: int = 300): """Run chaos experiment for duration.""" import 
time end_time = datetime.now().timestamp() + (duration_minutes * 60) logger.info(f"Starting chaos experiment for {duration_minutes} minutes") while datetime.now().timestamp() < end_time: victim = self.kill_random_pod(dry_run=False) if victim: logger.info(f"Killed {victim}, waiting {interval_seconds}s") time.sleep(interval_seconds) logger.info("Chaos experiment completed") # Usage if __name__ == "__main__": killer = ChaosPodKiller( namespace="production", label_selector="app=api-service" ) killer.run_experiment(duration_minutes=30, interval_seconds=300) Kubernetes CronJob for Continuous Chaos 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 apiVersion: batch/v1 kind: CronJob metadata: name: chaos-pod-killer namespace: chaos-system spec: schedule: "*/10 9-17 * * 1-5" # Every 10 min, 9-5, weekdays only jobTemplate: spec: template: spec: serviceAccountName: chaos-runner containers: - name: chaos image: chaos-toolkit:latest command: - python - /scripts/chaos_pod_killer.py env: - name: TARGET_NAMESPACE value: "production" - name: LABEL_SELECTOR value: "chaos-enabled=true" restartPolicy: Never Network Chaos Introduce Latency 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 # Using Chaos Mesh apiVersion: chaos-mesh.org/v1alpha1 kind: NetworkChaos metadata: name: network-delay namespace: chaos-testing spec: action: delay mode: all selector: namespaces: - production labelSelectors: app: api-service delay: latency: "100ms" jitter: "20ms" correlation: "50" duration: "5m" scheduler: cron: "@every 1h" Packet Loss 1 2 3 4 5 6 7 8 9 10 11 12 13 14 apiVersion: chaos-mesh.org/v1alpha1 kind: NetworkChaos metadata: name: network-loss spec: action: loss mode: one selector: labelSelectors: app: payment-service loss: loss: "10" # 10% packet loss correlation: "50" duration: "2m" Network Partition 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 apiVersion: chaos-mesh.org/v1alpha1 kind: NetworkChaos metadata: name: network-partition spec: action: partition mode: all 
selector: namespaces: - production labelSelectors: app: api-service direction: both target: selector: namespaces: - production labelSelectors: app: database duration: "30s" Resource Stress CPU Stress 1 2 3 4 5 6 7 8 9 10 11 12 13 14 apiVersion: chaos-mesh.org/v1alpha1 kind: StressChaos metadata: name: cpu-stress spec: mode: one selector: labelSelectors: app: api-service stressors: cpu: workers: 2 load: 80 # 80% CPU usage duration: "5m" Memory Stress 1 2 3 4 5 6 7 8 9 10 11 12 13 14 apiVersion: chaos-mesh.org/v1alpha1 kind: StressChaos metadata: name: memory-stress spec: mode: one selector: labelSelectors: app: api-service stressors: memory: workers: 1 size: "512MB" # Allocate 512MB duration: "3m" Application-Level Chaos HTTP Fault Injection with Istio 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 apiVersion: networking.istio.io/v1beta1 kind: VirtualService metadata: name: api-chaos spec: hosts: - api-service http: - fault: abort: percentage: value: 5 httpStatus: 503 delay: percentage: value: 10 fixedDelay: 2s route: - destination: host: api-service Custom Failure Injection 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 # chaos_middleware.py import random import time import os from functools import wraps CHAOS_ENABLED = os.getenv("CHAOS_ENABLED", "false").lower() == "true" CHAOS_FAILURE_RATE = float(os.getenv("CHAOS_FAILURE_RATE", "0.0")) CHAOS_LATENCY_MS = int(os.getenv("CHAOS_LATENCY_MS", "0")) def chaos_middleware(f): """Inject chaos into function calls.""" @wraps(f) def wrapper(*args, **kwargs): if not CHAOS_ENABLED: return f(*args, **kwargs) # Random failure if random.random() < CHAOS_FAILURE_RATE: raise Exception("Chaos injection: random failure") # Random latency if CHAOS_LATENCY_MS > 0: delay = random.randint(0, CHAOS_LATENCY_MS) / 1000 time.sleep(delay) return f(*args, **kwargs) return wrapper # Usage @chaos_middleware def call_payment_service(order): return requests.post(PAYMENT_URL, 
json=order.to_dict()) Experiment Framework 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 # chaos_experiment.py from dataclasses import dataclass from typing import Callable, List import time @dataclass class SteadyStateCheck: name: str check: Callable[[], bool] @dataclass class ChaosExperiment: name: str hypothesis: str steady_state_checks: List[SteadyStateCheck] inject_failure: Callable[[], None] rollback: Callable[[], None] duration_seconds: int = 60 def run_experiment(experiment: ChaosExperiment) -> dict: """Run a chaos experiment with proper controls.""" result = { "name": experiment.name, "hypothesis": experiment.hypothesis, "success": False, "steady_state_before": {}, "steady_state_after": {}, "errors": [] } # 1. Verify steady state before print(f"Checking steady state before experiment...") for check in experiment.steady_state_checks: try: passed = check.check() result["steady_state_before"][check.name] = passed if not passed: result["errors"].append(f"Pre-check failed: {check.name}") return result except Exception as e: result["errors"].append(f"Pre-check error: {check.name}: {e}") return result # 2. Inject failure print(f"Injecting failure: {experiment.name}") try: experiment.inject_failure() except Exception as e: result["errors"].append(f"Injection failed: {e}") experiment.rollback() return result # 3. Wait for duration print(f"Waiting {experiment.duration_seconds}s...") time.sleep(experiment.duration_seconds) # 4. 
Check steady state during/after print(f"Checking steady state after experiment...") for check in experiment.steady_state_checks: try: passed = check.check() result["steady_state_after"][check.name] = passed except Exception as e: result["steady_state_after"][check.name] = False result["errors"].append(f"Post-check error: {check.name}: {e}") # 5. Rollback print(f"Rolling back...") experiment.rollback() # 6. Evaluate hypothesis result["success"] = all(result["steady_state_after"].values()) return result # Example experiment def check_api_responding(): response = requests.get("http://api-service/health", timeout=5) return response.status_code == 200 def check_error_rate_low(): # Query Prometheus result = prometheus.query('sum(rate(http_requests_total{status=~"5.."}[1m])) / sum(rate(http_requests_total[1m]))') return float(result) < 0.01 def kill_one_api_pod(): killer = ChaosPodKiller("production", "app=api-service") killer.kill_random_pod(dry_run=False) def noop_rollback(): pass # Kubernetes will restart the pod experiment = ChaosExperiment( name="api-pod-failure", hypothesis="API continues serving traffic when one pod dies", steady_state_checks=[ SteadyStateCheck("api_responding", check_api_responding), SteadyStateCheck("error_rate_low", check_error_rate_low), ], inject_failure=kill_one_api_pod, rollback=noop_rollback, duration_seconds=60 ) result = run_experiment(experiment) print(f"Experiment {'PASSED' if result['success'] else 'FAILED'}") Game Days Schedule regular chaos exercises: ...

February 11, 2026 · 8 min · 1552 words · Rob Washington

Circuit Breakers: Building Systems That Fail Gracefully

In distributed systems, failures are inevitable. A single slow or failing service can cascade through your entire architecture, turning a minor issue into a major outage. Circuit breakers prevent this by detecting failures and stopping the cascade before it spreads. The Problem: Cascading Failures Imagine Service A calls Service B, which calls Service C. If Service C becomes slow: Requests to C start timing out Service B’s thread pool fills up waiting for C Service B becomes slow Service A’s threads fill up waiting for B Your entire system grinds to a halt One slow service just took down everything. ...

February 11, 2026 · 8 min · 1677 words · Rob Washington