Chaos-Engineering

You don’t know if your system handles failures gracefully until failures happen. Chaos engineering lets you find out on your terms—in controlled conditions, during business hours, with engineers ready to respond. The Principles Define steady state — What does “healthy” look like? Hypothesize — “The system will continue serving traffic if one pod dies” Inject failure — Kill the pod Observe — Did steady state hold? Learn — Fix what broke, update runbooks Start Simple: Kill Things Random Pod Termination 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 # chaos_pod_killer.py import random from kubernetes import client, config from datetime import datetime import logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class ChaosPodKiller: def __init__(self, namespace: str, label_selector: str): config.load_incluster_config() # or load_kube_config() for local self.v1 = client.CoreV1Api() self.namespace = namespace self.label_selector = label_selector def get_targets(self) -> list: """Get pods matching selector.""" pods = self.v1.list_namespaced_pod( namespace=self.namespace, label_selector=self.label_selector ) return [pod.metadata.name for pod in pods.items if pod.status.phase == "Running"] def kill_random_pod(self, dry_run: bool = True) -> str: """Kill a random pod from targets.""" targets = self.get_targets() if len(targets) <= 1: logger.warning("Only one pod running, skipping kill") return None victim = random.choice(targets) if dry_run: logger.info(f"DRY RUN: Would delete pod {victim}") else: logger.info(f"Deleting pod {victim}") self.v1.delete_namespaced_pod( name=victim, namespace=self.namespace, grace_period_seconds=0 ) return victim def run_experiment(self, duration_minutes: int = 30, interval_seconds: int = 300): """Run chaos experiment for duration.""" import time end_time = datetime.now().timestamp() + (duration_minutes * 60) logger.info(f"Starting chaos experiment for {duration_minutes} minutes") while datetime.now().timestamp() < end_time: victim = self.kill_random_pod(dry_run=False) if victim: logger.info(f"Killed {victim}, waiting {interval_seconds}s") time.sleep(interval_seconds) logger.info("Chaos experiment completed") # Usage if __name__ == "__main__": killer = ChaosPodKiller( namespace="production", label_selector="app=api-service" ) killer.run_experiment(duration_minutes=30, interval_seconds=300) Kubernetes CronJob for Continuous Chaos 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 apiVersion: batch/v1 kind: CronJob metadata: name: chaos-pod-killer namespace: chaos-system spec: schedule: "*/10 9-17 * * 1-5" # Every 10 min, 9-5, weekdays only jobTemplate: spec: template: spec: serviceAccountName: chaos-runner containers: - name: chaos image: chaos-toolkit:latest command: - python - /scripts/chaos_pod_killer.py env: - name: TARGET_NAMESPACE value: "production" - name: LABEL_SELECTOR value: "chaos-enabled=true" restartPolicy: Never Network Chaos Introduce Latency 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 # Using Chaos Mesh apiVersion: chaos-mesh.org/v1alpha1 kind: NetworkChaos metadata: name: network-delay namespace: chaos-testing spec: action: delay mode: all selector: namespaces: - production labelSelectors: app: api-service delay: latency: "100ms" jitter: "20ms" correlation: "50" duration: "5m" scheduler: cron: "@every 1h" Packet Loss 1 2 3 4 5 6 7 8 9 10 11 12 13 14 apiVersion: chaos-mesh.org/v1alpha1 kind: NetworkChaos metadata: name: network-loss spec: action: loss mode: one selector: labelSelectors: app: payment-service loss: loss: "10" # 10% packet loss correlation: "50" duration: "2m" Network Partition 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 apiVersion: chaos-mesh.org/v1alpha1 kind: NetworkChaos metadata: name: network-partition spec: action: partition mode: all selector: namespaces: - production labelSelectors: app: api-service direction: both target: selector: namespaces: - production labelSelectors: app: database duration: "30s" Resource Stress CPU Stress 1 2 3 4 5 6 7 8 9 10 11 12 13 14 apiVersion: chaos-mesh.org/v1alpha1 kind: StressChaos metadata: name: cpu-stress spec: mode: one selector: labelSelectors: app: api-service stressors: cpu: workers: 2 load: 80 # 80% CPU usage duration: "5m" Memory Stress 1 2 3 4 5 6 7 8 9 10 11 12 13 14 apiVersion: chaos-mesh.org/v1alpha1 kind: StressChaos metadata: name: memory-stress spec: mode: one selector: labelSelectors: app: api-service stressors: memory: workers: 1 size: "512MB" # Allocate 512MB duration: "3m" Application-Level Chaos HTTP Fault Injection with Istio 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 apiVersion: networking.istio.io/v1beta1 kind: VirtualService metadata: name: api-chaos spec: hosts: - api-service http: - fault: abort: percentage: value: 5 httpStatus: 503 delay: percentage: value: 10 fixedDelay: 2s route: - destination: host: api-service Custom Failure Injection 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 # chaos_middleware.py import random import time import os from functools import wraps CHAOS_ENABLED = os.getenv("CHAOS_ENABLED", "false").lower() == "true" CHAOS_FAILURE_RATE = float(os.getenv("CHAOS_FAILURE_RATE", "0.0")) CHAOS_LATENCY_MS = int(os.getenv("CHAOS_LATENCY_MS", "0")) def chaos_middleware(f): """Inject chaos into function calls.""" @wraps(f) def wrapper(*args, **kwargs): if not CHAOS_ENABLED: return f(*args, **kwargs) # Random failure if random.random() < CHAOS_FAILURE_RATE: raise Exception("Chaos injection: random failure") # Random latency if CHAOS_LATENCY_MS > 0: delay = random.randint(0, CHAOS_LATENCY_MS) / 1000 time.sleep(delay) return f(*args, **kwargs) return wrapper # Usage @chaos_middleware def call_payment_service(order): return requests.post(PAYMENT_URL, json=order.to_dict()) Experiment Framework 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 # chaos_experiment.py from dataclasses import dataclass from typing import Callable, List import time @dataclass class SteadyStateCheck: name: str check: Callable[[], bool] @dataclass class ChaosExperiment: name: str hypothesis: str steady_state_checks: List[SteadyStateCheck] inject_failure: Callable[[], None] rollback: Callable[[], None] duration_seconds: int = 60 def run_experiment(experiment: ChaosExperiment) -> dict: """Run a chaos experiment with proper controls.""" result = { "name": experiment.name, "hypothesis": experiment.hypothesis, "success": False, "steady_state_before": {}, "steady_state_after": {}, "errors": [] } # 1. Verify steady state before print(f"Checking steady state before experiment...") for check in experiment.steady_state_checks: try: passed = check.check() result["steady_state_before"][check.name] = passed if not passed: result["errors"].append(f"Pre-check failed: {check.name}") return result except Exception as e: result["errors"].append(f"Pre-check error: {check.name}: {e}") return result # 2. Inject failure print(f"Injecting failure: {experiment.name}") try: experiment.inject_failure() except Exception as e: result["errors"].append(f"Injection failed: {e}") experiment.rollback() return result # 3. Wait for duration print(f"Waiting {experiment.duration_seconds}s...") time.sleep(experiment.duration_seconds) # 4. Check steady state during/after print(f"Checking steady state after experiment...") for check in experiment.steady_state_checks: try: passed = check.check() result["steady_state_after"][check.name] = passed except Exception as e: result["steady_state_after"][check.name] = False result["errors"].append(f"Post-check error: {check.name}: {e}") # 5. Rollback print(f"Rolling back...") experiment.rollback() # 6. Evaluate hypothesis result["success"] = all(result["steady_state_after"].values()) return result # Example experiment def check_api_responding(): response = requests.get("http://api-service/health", timeout=5) return response.status_code == 200 def check_error_rate_low(): # Query Prometheus result = prometheus.query('sum(rate(http_requests_total{status=~"5.."}[1m])) / sum(rate(http_requests_total[1m]))') return float(result) < 0.01 def kill_one_api_pod(): killer = ChaosPodKiller("production", "app=api-service") killer.kill_random_pod(dry_run=False) def noop_rollback(): pass # Kubernetes will restart the pod experiment = ChaosExperiment( name="api-pod-failure", hypothesis="API continues serving traffic when one pod dies", steady_state_checks=[ SteadyStateCheck("api_responding", check_api_responding), SteadyStateCheck("error_rate_low", check_error_rate_low), ], inject_failure=kill_one_api_pod, rollback=noop_rollback, duration_seconds=60 ) result = run_experiment(experiment) print(f"Experiment {'PASSED' if result['success'] else 'FAILED'}") Game Days Schedule regular chaos exercises: ...