1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
| # chaos_experiment.py
from dataclasses import dataclass
from typing import Callable, List
import time
@dataclass
class SteadyStateCheck:
    """A named probe used to decide whether the system is in steady state."""

    # Label for this probe; run_experiment uses it as the key in its
    # per-check result dicts and in error messages.
    name: str
    # Zero-argument callable returning True when the system looks healthy.
    check: Callable[[], bool]
@dataclass
class ChaosExperiment:
    """Everything needed to run one chaos experiment.

    Bundles the hypothesis being tested, the probes that define steady
    state, and the hooks that inject and undo the failure.
    """

    # Short identifier; echoed in the result dict and in log output.
    name: str
    # Human-readable statement of what is expected to stay true.
    hypothesis: str
    # Probes evaluated before injection and again after the wait period.
    steady_state_checks: List[SteadyStateCheck]
    # Zero-argument hook that introduces the failure.
    inject_failure: Callable[[], None]
    # Zero-argument hook that undoes the failure.
    rollback: Callable[[], None]
    # How long to wait after injection before re-running the probes.
    duration_seconds: int = 60
def run_experiment(experiment: "ChaosExperiment") -> dict:
    """Run a chaos experiment and always attempt to roll the failure back.

    Steps:
      1. Run every steady-state check; abort (without injecting anything)
         on the first check that fails or raises.
      2. Inject the failure.
      3. Sleep for ``experiment.duration_seconds``.
      4. Re-run every steady-state check; a check that raises counts as
         failed and the error is recorded.
      5. Roll back. This happens in a ``finally`` clause, so it runs even
         when injection fails or the wait/checks are interrupted.
      6. Evaluate the hypothesis.

    Args:
        experiment: The experiment definition (name, hypothesis, checks,
            inject/rollback hooks and wait duration).

    Returns:
        A dict with keys ``name``, ``hypothesis``, ``success`` (bool),
        ``steady_state_before`` / ``steady_state_after`` (check name ->
        bool) and ``errors`` (list of messages). ``success`` is True only
        when every post-injection check passed and the rollback did not
        raise. With an empty check list it is vacuously True.

    Raises:
        Anything raised by ``time.sleep`` (e.g. ``ValueError`` for a
        negative duration) and any non-``Exception`` ``BaseException``
        such as ``KeyboardInterrupt``; in both cases the rollback is
        attempted before the exception propagates.
    """
    result = {
        "name": experiment.name,
        "hypothesis": experiment.hypothesis,
        "success": False,
        "steady_state_before": {},
        "steady_state_after": {},
        "errors": [],
    }

    # 1. Verify steady state before. Nothing has been injected yet, so an
    #    early return here needs no rollback.
    print("Checking steady state before experiment...")
    for check in experiment.steady_state_checks:
        try:
            passed = check.check()
            result["steady_state_before"][check.name] = passed
            if not passed:
                result["errors"].append(f"Pre-check failed: {check.name}")
                return result
        except Exception as e:
            result["errors"].append(f"Pre-check error: {check.name}: {e}")
            return result

    rollback_ok = False
    try:
        # 2. Inject failure
        print(f"Injecting failure: {experiment.name}")
        try:
            experiment.inject_failure()
        except Exception as e:
            # The injection may have been partially applied; the finally
            # clause below still rolls back before this return completes.
            result["errors"].append(f"Injection failed: {e}")
            return result

        # 3. Wait for duration
        print(f"Waiting {experiment.duration_seconds}s...")
        time.sleep(experiment.duration_seconds)

        # 4. Check steady state during/after
        print("Checking steady state after experiment...")
        for check in experiment.steady_state_checks:
            try:
                passed = check.check()
                result["steady_state_after"][check.name] = passed
            except Exception as e:
                result["steady_state_after"][check.name] = False
                result["errors"].append(f"Post-check error: {check.name}: {e}")
    finally:
        # 5. Rollback. Record a failing rollback instead of raising so the
        #    collected result (and any in-flight exception) is not lost.
        print("Rolling back...")
        try:
            experiment.rollback()
            rollback_ok = True
        except Exception as e:
            result["errors"].append(f"Rollback failed: {e}")

    # 6. Evaluate hypothesis. A failed rollback means the system may still
    #    be degraded, so the run is never reported as successful.
    result["success"] = rollback_ok and all(result["steady_state_after"].values())
    return result
# Example experiment
def check_api_responding():
    """Return True when the API health endpoint answers with HTTP 200.

    Issues a GET to ``http://api-service/health`` with a 5 second timeout.
    Exceptions raised by the request are not caught here.
    """
    # NOTE(review): `requests` is not among the imports visible in this
    # file - confirm it is imported before this function is called.
    health = requests.get("http://api-service/health", timeout=5)
    is_healthy = health.status_code == 200
    return is_healthy
def check_error_rate_low():
    """Return True when the 5xx share of HTTP requests is below 0.01.

    The PromQL expression divides the one-minute rate of requests whose
    status matches ``5..`` by the one-minute rate of all requests.
    """
    # NOTE(review): `prometheus` is not defined in the visible part of this
    # file; this assumes its query() returns a value float() accepts - TODO
    # confirm.
    error_ratio = prometheus.query('sum(rate(http_requests_total{status=~"5.."}[1m])) / sum(rate(http_requests_total[1m]))')
    is_low = float(error_ratio) < 0.01
    return is_low
def kill_one_api_pod():
    """Failure injection hook: kill one randomly chosen API pod for real.

    Calls ``kill_random_pod`` with ``dry_run=False``, so this is not a
    simulation.
    """
    # NOTE(review): `ChaosPodKiller` is not defined in the visible part of
    # this file; the two arguments are presumably a namespace and a label
    # selector - confirm against its definition.
    ChaosPodKiller("production", "app=api-service").kill_random_pod(dry_run=False)
def noop_rollback():
    """Rollback hook that intentionally does nothing.

    Per the original author's note, Kubernetes is expected to restart the
    killed pod, so there is nothing to undo manually.
    """
    return None
# Example run: kill one API pod and verify the service keeps serving traffic.
# NOTE(review): these statements execute at import time (there is no
# `if __name__ == "__main__":` guard), so merely importing this module runs
# the experiment with a real, non-dry-run pod kill. Consider adding a guard.
experiment = ChaosExperiment(
    name="api-pod-failure",
    hypothesis="API continues serving traffic when one pod dies",
    steady_state_checks=[
        SteadyStateCheck(name="api_responding", check=check_api_responding),
        SteadyStateCheck(name="error_rate_low", check=check_error_rate_low),
    ],
    inject_failure=kill_one_api_pod,
    rollback=noop_rollback,
    duration_seconds=60,
)

result = run_experiment(experiment)

# Report the verdict derived from the post-injection steady-state checks.
if result["success"]:
    print("Experiment PASSED")
else:
    print("Experiment FAILED")
|